From 4a32688a82c1f7d74b5067abac1ff037dcbdba20 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 2 Apr 2024 14:54:14 -0500 Subject: [PATCH 001/119] updated tracked test data --- tests/data/clusters/expected_clusters.txt | 4 ++++ tests/data/clusters/expected_tree.nwk | 1 + tests/data/distances/expected_dists.tsv | 4 ++++ tests/data/profiles/expected-profile1.tsv | 4 ++++ tests/data/reports/sample1.mlst.json | 7 +++++++ tests/data/reports/sample2.mlst.json | 7 +++++++ tests/data/reports/sample3.mlst.json | 7 +++++++ tests/data/reports/sampleQ.mlst.json | 7 +++++++ tests/data/samplesheets/samplesheet1.csv | 5 +++++ 9 files changed, 46 insertions(+) create mode 100644 tests/data/clusters/expected_clusters.txt create mode 100644 tests/data/clusters/expected_tree.nwk create mode 100644 tests/data/distances/expected_dists.tsv create mode 100644 tests/data/profiles/expected-profile1.tsv create mode 100644 tests/data/reports/sample1.mlst.json create mode 100644 tests/data/reports/sample2.mlst.json create mode 100644 tests/data/reports/sample3.mlst.json create mode 100644 tests/data/reports/sampleQ.mlst.json create mode 100644 tests/data/samplesheets/samplesheet1.csv diff --git a/tests/data/clusters/expected_clusters.txt b/tests/data/clusters/expected_clusters.txt new file mode 100644 index 0000000..c4adfe5 --- /dev/null +++ b/tests/data/clusters/expected_clusters.txt @@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 +sample1 1.1.1 1 1 1 +sample2 1.1.1 1 1 1 +sample3 2.2.2 2 2 2 diff --git a/tests/data/clusters/expected_tree.nwk b/tests/data/clusters/expected_tree.nwk new file mode 100644 index 0000000..a8cc370 --- /dev/null +++ b/tests/data/clusters/expected_tree.nwk @@ -0,0 +1 @@ +((sample2:0.000000,sample1:0.000000):16.666666666666668,sample3:33.333333); diff --git a/tests/data/distances/expected_dists.tsv b/tests/data/distances/expected_dists.tsv new file mode 100644 index 0000000..00e9cec --- /dev/null +++ b/tests/data/distances/expected_dists.tsv @@ -0,0 +1,4 @@ +dists sample1 sample2 sample3 +sample1 0.0 0.0 33.333333333333336 +sample2 0.0 0.0 33.333333333333336 +sample3 33.333333333333336 33.333333333333336 0.0 diff --git a/tests/data/profiles/expected-profile1.tsv b/tests/data/profiles/expected-profile1.tsv new file mode 100644 index 0000000..233f6e4 --- /dev/null +++ b/tests/data/profiles/expected-profile1.tsv @@ -0,0 +1,4 @@ +sample_id l1 l2 l3 +sample1 1 1 1 +sample2 1 1 1 +sample3 1 1 2 diff --git a/tests/data/reports/sample1.mlst.json b/tests/data/reports/sample1.mlst.json new file mode 100644 index 0000000..393f0ac --- /dev/null +++ b/tests/data/reports/sample1.mlst.json @@ -0,0 +1,7 @@ +{ + "sample1": { + "l1": "1", + "l2": "1", + "l3": "1" + } +} diff --git a/tests/data/reports/sample2.mlst.json b/tests/data/reports/sample2.mlst.json new file mode 100644 index 0000000..9af0a4c --- /dev/null +++ b/tests/data/reports/sample2.mlst.json @@ -0,0 +1,7 @@ +{ + "sample2": { + "l1": "1", + "l2": "1", + "l3": "1" + } +} diff --git a/tests/data/reports/sample3.mlst.json b/tests/data/reports/sample3.mlst.json new file mode 100644 index 0000000..88c3d0c --- /dev/null +++ b/tests/data/reports/sample3.mlst.json @@ -0,0 +1,7 @@ +{ + "sample3": { + "l1": "1", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/reports/sampleQ.mlst.json b/tests/data/reports/sampleQ.mlst.json new file mode 100644 index 0000000..c6cca43 --- /dev/null +++ b/tests/data/reports/sampleQ.mlst.json @@ -0,0 +1,7 @@ +{ + "sampleQ": { + "l1": "1", + "l2": "2", + "l3": "1" + } +} diff --git 
a/tests/data/samplesheets/samplesheet1.csv b/tests/data/samplesheets/samplesheet1.csv
new file mode 100644
index 0000000..8b36335
--- /dev/null
+++ b/tests/data/samplesheets/samplesheet1.csv
@@ -0,0 +1,5 @@
+sample,profile_type,mlst_alleles
+sampleQ,false,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json
+sample1,true,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json
+sample2,true,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json
+sample3,true,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json

From 3ef255b55d7c79a43fc8c10d5fcb22fb6c2c20c6 Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Tue, 2 Apr 2024 16:09:17 -0500
Subject: [PATCH 002/119] Created initial workflow

Implemented the initial workflow; it is likely that many changes will need
to be made. However, I am focusing on implementing working unit tests for
the time being.

---
 tests/data/called/expected_results.txt | 5 +++++
 tests/data/distances/expected_pairwise_dists.txt | 4 ++++
 2 files changed, 9 insertions(+)
 create mode 100644 tests/data/called/expected_results.txt
 create mode 100644 tests/data/distances/expected_pairwise_dists.txt

diff --git a/tests/data/called/expected_results.txt b/tests/data/called/expected_results.txt
new file mode 100644
index 0000000..8a3eec9
--- /dev/null
+++ b/tests/data/called/expected_results.txt
@@ -0,0 +1,5 @@
+id address level_1 level_2 level_3
+sample1 1.1.1 1 1 1
+sample2 1.1.1 1 1 1
+sample3 2.2.2 2 2 2
+sampleQ 1.1.1 1 1 1
diff --git a/tests/data/distances/expected_pairwise_dists.txt b/tests/data/distances/expected_pairwise_dists.txt
new file mode 100644
index 0000000..df58510
--- /dev/null
+++ b/tests/data/distances/expected_pairwise_dists.txt
@@ -0,0 +1,4 @@
+query_id ref_id dist
+sampleQ sample1 0.0
+sampleQ sample2 33.333333333333336
+sampleQ sample3 66.66666666666667

From 40e41c87896013c10de4350cea19ab27b297e539 Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Tue, 2 Apr 2024 16:23:47 -0500
Subject: [PATCH 003/119] added initial unit tests

I added some unit tests to the pipeline; however, these tests are
incomplete and show work that still needs to be done, in particular
altering the output paths, which are quite messy currently. Additionally,
more tests with the existing data can be added, as that would really
improve the robustness of the pipeline.

---
 tests/nextflow.config | 13 ++++++++++++
 tests/pipelines/main.nf.test | 40 ++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 tests/pipelines/main.nf.test

diff --git a/tests/nextflow.config b/tests/nextflow.config
index c19b1ad..ff66ccb
--- a/tests/nextflow.config
+++ b/tests/nextflow.config
@@ -3,3 +3,16 @@
 Nextflow config file for running tests
 ========================================================================================
 */
+
+
+params.max_memory = "2.GB"
+params.max_cpus = 1
+params.ref_clusters = "$baseDir/tests/data/expected_clusters.txt"
+
+
+/* This is required to run in WSL/Ubuntu using singularity
+Without this, profile_dists was not successfully completing
+due to issues with multiprocessing in the container.
A similar +error is found at https://github.com/marcelm/cutadapt/issues/583 +*/ +singularity.runOptions = "--contain" diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test new file mode 100644 index 0000000..7370ad0 --- /dev/null +++ b/tests/pipelines/main.nf.test @@ -0,0 +1,40 @@ +nextflow_pipeline { + + name "Integration test of nomenclature assignment pipeline" + script "main.nf" + + test("Small-scale test of full pipeline"){ + tag "pipeline" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check merged profiles + // TODO check query profile is merged + def actual_profile_ref = path("$launchDir/results/locidex/merged/query/merged_value/merged_profiles_value.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv") + assert actual_profile_ref.text == expected_profile_tsv.text + + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.tsv") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + } + } + + +} From f4d2887fb633be0b1bf6857d72db0cc294193e1e Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 2 Apr 2024 16:27:17 -0500 Subject: [PATCH 004/119] removed module tests --- conf/modules.config | 49 ++++++++++++- modules/local/gas/call/main.nf | 13 ++-- modules/local/locidex/merge/main.nf | 8 ++- modules/local/profile_dists/main.nf | 23 ++++--- nextflow.config | 17 +++++ tests/modules/local/assemblystub/main.nf.test | 38 ---------- .../local/generatesamplejson/main.nf.test | 40 ----------- .../local/generatesummary/main.nf.test | 37 ---------- .../local/iridanextoutput/main.nf.test | 51 -------------- .../local/simplifyiridajson/main.nf.test | 41 ----------- workflows/gas_nomenclature.nf | 69 ++++++++++++++++--- 11 files changed, 148 insertions(+), 238 deletions(-) delete mode 100644 tests/modules/local/assemblystub/main.nf.test delete mode 100644 tests/modules/local/generatesamplejson/main.nf.test delete mode 100644 tests/modules/local/generatesummary/main.nf.test delete mode 100644 tests/modules/local/iridanextoutput/main.nf.test delete mode 100644 tests/modules/local/simplifyiridajson/main.nf.test diff --git a/conf/modules.config b/conf/modules.config index 08fc284..86c0455 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -15,8 +15,11 @@ process { // Publish directory names assembly_directory_name = "assembly" summary_directory_name = "summary" + profile_dists_directory_name = "distances" + gas_call_directory_name = "call" - locidex_merge_directory_name = [params.outdir , "locidex", "merge"].join(File.separator) + locidex_merge_ref_directory_name = [params.outdir , "locidex", "merge", "reference"].join(File.separator) + locidex_merge_query_directory_name = [params.outdir , "locidex", "merge", "query"].join(File.separator) publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, @@ -56,15 +59,55 @@ process { ] } - withName: LOCIDEX_MERGE { + withName: LOCIDEX_MERGE_REF { publishDir = [ - path: locidex_merge_directory_name, + path: 
locidex_merge_ref_directory_name, mode: params.publish_dir_mode, pattern: "*/*", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: LOCIDEX_MERGE_QUERY { + publishDir = [ + path: locidex_merge_query_directory_name, + mode: params.publish_dir_mode, + pattern: "*/*", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + + withName: PROFILE_DISTS { + publishDir = [ + path: { ["${params.outdir}", "${task.profile_dists_directory_name}"].join(File.separator) }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : + filename.contains(File.separator) ? filename.split(File.separator)[-1] : filename } + ] + } + + withName: GAS_CALL { + publishDir = [ + [ + path: { ["${params.outdir}", "${task.gas_call_directory_name}"].join(File.separator) }, + mode: params.publish_dir_mode, + pattern: "*/thresholds.json" + ], + [ + path: { ["${params.outdir}", "${task.gas_call_directory_name}"].join(File.separator) }, + mode: params.publish_dir_mode, + pattern: "*/results.{text,parquet}" + ], + [ + path: { ["${params.outdir}", "${task.gas_call_directory_name}"].join(File.separator) }, + mode: params.publish_dir_mode, + pattern: "*/run.json" + ] + ] + } + + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/modules/local/gas/call/main.nf b/modules/local/gas/call/main.nf index 1fe2c64..33db7a7 100644 --- a/modules/local/gas/call/main.nf +++ b/modules/local/gas/call/main.nf @@ -2,7 +2,7 @@ process GAS_CALL{ label "process_high" - tag "Calling: ${meta.id}" + tag "Assigning Nomenclature" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/genomic_address_service%3A0.1.1--pyh7cba7a3_1' : @@ -10,17 +10,18 @@ process GAS_CALL{ input: - tuple val(meta), path(reference_clusters), path(distances) + path(reference_clusters) + path(distances) output: - tuple val(meta), path("${prefix}/results.{text,parquet}"), emit: distances, optional: true - tuple val(meta), path("${prefix}/thresholds.json"), emit: thresholds - tuple val(meta), path("${prefix}/run.json"), emit: run + path("${prefix}/results.{text,parquet}"), emit: distances, optional: true + path("${prefix}/thresholds.json"), emit: thresholds + path("${prefix}/run.json"), emit: run path "versions.yml", emit: versions script: // Need to add more args for gas call below - prefix = meta.id + prefix = "Called" """ gas call --dists $distances \\ --rclusters $reference_clusters \\ diff --git a/modules/local/locidex/merge/main.nf b/modules/local/locidex/merge/main.nf index bd9f3e8..b58b154 100644 --- a/modules/local/locidex/merge/main.nf +++ b/modules/local/locidex/merge/main.nf @@ -9,17 +9,19 @@ process LOCIDEX_MERGE { 'quay.io/biocontainers/locidex:0.1.1--pyhdfd78af_0' }" input: - val input_values // [file(sample1), file(sample2), file(sample3), etc...] + path input_values // [file(sample1), file(sample2), file(sample3), etc...] 
+ val input_tag // makes output unique and denotes the item as the reference or query to prevent name collision

 output:
 path("${combined_dir}/*.tsv"), emit: combined_profiles
- path("${combined_dir}/*.json"), emit: report
 path "versions.yml", emit: versions

 script:
- combined_dir = "merged"
+ combined_dir = "merged_${input_tag}"
 """
 locidex merge -i ${input_values.join(' ')} -o ${combined_dir}
+
+ mv ${combined_dir}/*.tsv ${combined_dir}/merged_profiles_${input_tag}.tsv

 cat <<-END_VERSIONS > versions.yml
 "${task.process}":
 locidex merge: \$(echo \$(locidex search -V 2>&1) | sed 's/^.*locidex //' )
diff --git a/modules/local/profile_dists/main.nf b/modules/local/profile_dists/main.nf
index 2e48a02..b7a0933 100644
--- a/modules/local/profile_dists/main.nf
+++ b/modules/local/profile_dists/main.nf
@@ -1,24 +1,25 @@
 process PROFILE_DISTS{
 label "process_high"
- tag "Pairwise Distance Generation: ${meta.id}"
+ tag "Gathering Distances Between Reference and Query Profiles"
 container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
 'https://depot.galaxyproject.org/singularity/profile_dists%3A1.0.0--pyh7cba7a3_0' :
 'quay.io/biocontainers/profile_dists:1.0.0--pyh7cba7a3_0' }"

 input:
- tuple val(meta), path(query), path(ref)
+ path query
+ path ref
 val mapping_format
- path(mapping_file)
- path(columns)
+ path mapping_file
+ path columns

 output:
- tuple val(meta), path("${prefix}_${mapping_format}/allele_map.json"), emit: allele_map
- tuple val(meta), path("${prefix}_${mapping_format}/query_profile.{text,parquet}"), emit: query_profile
- tuple val(meta), path("${prefix}_${mapping_format}/ref_profile.{text,parquet}"), emit: ref_profile
- tuple val(meta), path("${prefix}_${mapping_format}/results.{text,parquet}"), emit: results
- tuple val(meta), path("${prefix}_${mapping_format}/run.json"), emit: run
+ path("${prefix}/allele_map.json"), emit: allele_map
+ path("${prefix}/query_profile.{text,parquet}"), emit: query_profile
+ path("${prefix}/ref_profile.{text,parquet}"), emit: ref_profile
+ path("${prefix}/results.{text,parquet}"), emit: results
+ path("${prefix}/run.json"), emit: run
 path "versions.yml", emit: versions

@@ -41,7 +42,7 @@
 args = args + " --count_missing"
 }
 // --match_threshold $params.profile_dists.match_thresh \\
- prefix = meta.id
+ prefix = "distances_${mapping_format}"
 """
 profile_dists --query $query --ref $ref $args --outfmt $mapping_format \\
 --distm $params.pd_distm \\
 --missing_thresh $params.pd_missing_threshold \\
 --sample_qual_thresh $params.pd_sample_quality_threshold \\
 --max_mem ${task.memory.toGiga()} \\
 --cpus ${task.cpus} \\
- -o ${prefix}_${mapping_format}
+ -o ${prefix}

 cat <<-END_VERSIONS > versions.yml
 "${task.process}":
diff --git a/nextflow.config b/nextflow.config
index bddc99e..89da1c6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -47,6 +47,23 @@ params {
 validate_params = true

 // Profile Dists
+ pd_outfmt = "pairwise"
+ pd_distm = "scaled"
+ pd_missing_threshold = 1.0
+ pd_sample_quality_threshold = 1.0
+ pd_match_threshold = -1.0
+ pd_file_type = "text"
+ pd_mapping_file = null // default is no file
+ pd_force = false
+ pd_skip = false
+ pd_columns = null
+ pd_count_missing = true
+
+
+ // GAS Call
+ gm_thresholds = "10,5,0"
+ gm_delimiter = "'.'" // note the single quotes surrounding the delimiter
+ ref_clusters = ""
 }

diff --git a/tests/modules/local/assemblystub/main.nf.test b/tests/modules/local/assemblystub/main.nf.test
deleted file mode 100644
index 881bf56..0000000
---
a/tests/modules/local/assemblystub/main.nf.test +++ /dev/null @@ -1,38 +0,0 @@ -nextflow_process { - - name "Test Process ASSEMBLY_STUB" - script "modules/local/assemblystub/main.nf" - process "ASSEMBLY_STUB" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = new Tuple(["id": "SAMPLE1"], [file("sample1_R1.fastq.gz"), file("sample1_R2.fastq.gz")]) - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert assembly.size() == 1 - - // parse assembly file - def assembly_header = path(assembly.get(0)[1]).linesGzip[0] - def assembly_body = path(assembly.get(0)[1]).linesGzip[1] - - assert assembly_header.equals(">SAMPLE1-stub-assembly") - assert assembly_body.equals("ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTTAAAAACCCCCGGGGGTTTTT") - } - } - - } - -} diff --git a/tests/modules/local/generatesamplejson/main.nf.test b/tests/modules/local/generatesamplejson/main.nf.test deleted file mode 100644 index ac071a3..0000000 --- a/tests/modules/local/generatesamplejson/main.nf.test +++ /dev/null @@ -1,40 +0,0 @@ -nextflow_process { - - name "Test Process GENERATE_SAMPLE_JSON" - script "modules/local/generatesamplejson/main.nf" - process "GENERATE_SAMPLE_JSON" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = new Tuple(["id": "SAMPLE1"], [file("sample1_R1.fastq.gz"), file("sample1_R2.fastq.gz")], file("SAMPLE1.assembly.fa.gz")) - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert json.size() == 1 - - // parse output json file - def sample_json_string = path(json.get(0)[1]).linesGzip.join("\n") - def parser = new groovy.json.JsonSlurper() - def sample_json = parser.parseText(sample_json_string) - - assert sample_json.files.samples.SAMPLE1[0].path.equals("assembly/SAMPLE1.assembly.fa.gz") - assert sample_json.metadata.samples.SAMPLE1.reads[0].equals("sample1_R1.fastq.gz") - assert sample_json.metadata.samples.SAMPLE1.reads[1].equals("sample1_R2.fastq.gz") - } - } - - } - -} diff --git a/tests/modules/local/generatesummary/main.nf.test b/tests/modules/local/generatesummary/main.nf.test deleted file mode 100644 index b2eb189..0000000 --- a/tests/modules/local/generatesummary/main.nf.test +++ /dev/null @@ -1,37 +0,0 @@ -nextflow_process { - - name "Test Process GENERATE_SUMMARY" - script "modules/local/generatesummary/main.nf" - process "GENERATE_SUMMARY" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = [new Tuple(["id": "SAMPLE1"], [file("sample1_R1.fastq.gz"), file("sample1_R2.fastq.gz")], file("SAMPLE1.assembly.fa.gz"))] - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert summary.size() == 1 - - assert path(summary.get(0)).linesGzip[0].equals("IRIDANEXTEXAMPLE Pipeline Summary") - assert path(summary.get(0)).linesGzip[4].equals("SAMPLE1:") - assert path(summary.get(0)).linesGzip[5].contains("reads.1: ") - assert path(summary.get(0)).linesGzip[6].contains("reads.2: ") - assert path(summary.get(0)).linesGzip[7].contains("assembly: ") - } - } - - } - -} diff --git a/tests/modules/local/iridanextoutput/main.nf.test b/tests/modules/local/iridanextoutput/main.nf.test deleted file mode 100644 index 72808ab..0000000 --- a/tests/modules/local/iridanextoutput/main.nf.test 
+++ /dev/null
@@ -1,51 +0,0 @@
-nextflow_process {
-
- name "Test Process IRIDA_NEXT_OUTPUT"
- script "modules/local/iridanextoutput/main.nf"
- process "IRIDA_NEXT_OUTPUT"
-
- test("Basic execution, check output.") {
-
- when {
- params {
- outdir = "tests/results"
- }
- process {
- """
- input[0] = [file("$baseDir/tests/data/SAMPLE1.simple.json.gz"), file("$baseDir/tests/data/SAMPLE2.simple.json.gz"), file("$baseDir/tests/data/SAMPLE3.simple.json.gz")]
- """
- }
- }
-
- then {
- assert process.success
-
- with(process.out) {
- // check if emitted output has been created
- assert output_json.size() == 1
-
- // parse output json file
- def json_string = path(output_json.get(0)).linesGzip.join("\n")
- def parser = new groovy.json.JsonSlurper()
- def irida_json = parser.parseText(json_string)
-
- assert irida_json.files.global[0].path.equals("summary/summary.txt.gz")
-
- assert irida_json.files.samples.SAMPLE1[0].path.equals("assembly/SAMPLE1.assembly.fa.gz")
- assert irida_json.files.samples.SAMPLE2[0].path.equals("assembly/SAMPLE2.assembly.fa.gz")
- assert irida_json.files.samples.SAMPLE3[0].path.equals("assembly/SAMPLE3.assembly.fa.gz")
-
- assert irida_json.metadata.samples.SAMPLE1.'reads.1'.equals("sample1_R1.fastq.gz")
- assert irida_json.metadata.samples.SAMPLE1.'reads.2'.equals("sample1_R2.fastq.gz")
-
- assert irida_json.metadata.samples.SAMPLE2.'reads.1'.equals("sample2_R1.fastq.gz")
- assert irida_json.metadata.samples.SAMPLE2.'reads.2'.equals("sample2_R2.fastq.gz")
-
- assert irida_json.metadata.samples.SAMPLE3.'reads.1'.equals("sample1_R1.fastq.gz")
- assert irida_json.metadata.samples.SAMPLE3.'reads.2'.equals("null")
- }
- }
-
- }
-
-}
diff --git a/tests/modules/local/simplifyiridajson/main.nf.test b/tests/modules/local/simplifyiridajson/main.nf.test
deleted file mode 100644
index 7d61567..0000000
--- a/tests/modules/local/simplifyiridajson/main.nf.test
+++ /dev/null
@@ -1,41 +0,0 @@
-nextflow_process {
-
- name "Test Process SIMPLIFY_IRIDA_JSON"
- script "modules/local/simplifyiridajson/main.nf"
- process "SIMPLIFY_IRIDA_JSON"
-
- test("Basic execution, check output.") {
-
- when {
- params {
- outdir = "tests/results"
- }
- process {
- """
- input[0] = new Tuple(["id": "SAMPLE1"], file("$baseDir/tests/data/SAMPLE1.json.gz"))
- """
- }
- }
-
- then {
- assert process.success
-
- with(process.out) {
- // check if emitted output has been created
- assert simple_json.size() == 1
-
- // parse output json file
- def json_string = path(simple_json.get(0)[1]).linesGzip.join("\n")
- def parser = new groovy.json.JsonSlurper()
- def json_simple = parser.parseText(json_string)
-
- assert json_simple.files.samples.SAMPLE1[0].path.equals("assembly/SAMPLE1.assembly.fa.gz")
-
- assert json_simple.metadata.samples.SAMPLE1.'reads.1'.equals("sample1_R1.fastq.gz")
- assert json_simple.metadata.samples.SAMPLE1.'reads.2'.equals("sample1_R2.fastq.gz")
- }
- }
-
- }
-
-}
diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf
index 453a922..f9a6572 100644
--- a/workflows/gas_nomenclature.nf
+++ b/workflows/gas_nomenclature.nf
@@ -48,6 +48,22 @@ include { PROFILE_DISTS } from "../modules/local/profile_dists/main"
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
+
+def prepareFilePath(String filep){
+    // Returns null if a file is not valid
+    def return_path = null
+    if(filep){
+        file_in = path(filep)
+        if(file_in.exists()){
+            return_path = file_in
+        }
+    }else{
+        return_path = []
+    }
+
+    return return_path // empty value if file argument is
null +} + workflow GAS_NOMENCLATURE { ch_versions = Channel.empty() @@ -62,11 +78,47 @@ workflow GAS_NOMENCLATURE { } reference_values = profiles.ref.collect{ meta, profile -> profile} - query_values = profile.query.collect{ meta, profile -> proifile } - reference_values.view() - query_values.view() - //LOCIDEX_MERGE_REF(reference_values) - //LOCIDEX_MERGE_QUERY(query_values) + query_values = profiles.query.collect{ meta, profile -> profile } + + // LOCIDEX modules + ref_tag = Channel.value("ref") + query_tag = Channel.value("value") + merged_references = LOCIDEX_MERGE_REF(reference_values, ref_tag) + ch_versions = ch_versions.mix(merged_references.versions) + + merged_queries = LOCIDEX_MERGE_QUERY(query_values, query_tag) + ch_versions = ch_versions.mix(merged_queries.versions) + + + // PROFILE DISTS processes + + mapping_file = prepareFilePath(params.pd_mapping_file) + if(mapping_file == null){ + exit 1, "${params.pd_mapping_file}: Does not exist but was passed to the pipeline. Exiting now." + } + + + columns_file = prepareFilePath(params.pd_columns) + if(columns_file == null){ + exit 1, "${params.pd_columns}: Does not exist but was passed to the pipeline. Exiting now." + } + + mapping_format = Channel.value(params.pd_outfmt) + + distances = PROFILE_DISTS(merged_queries.combined_profiles, + merged_references.combined_profiles, + mapping_format, + mapping_file, + columns_file) + + ch_versions = ch_versions.mix(distances.versions) + + // GAS CALL + + clusters = Channel.fromPath(params.ref_clusters, checkIfExists: true) + called_data = GAS_CALL(clusters, distances.results) + + ch_versions = ch_versions.mix(called_data.versions) // A channel of tuples of ({meta}, [read[0], read[1]], assembly) @@ -93,9 +145,10 @@ workflow GAS_NOMENCLATURE { //) //ch_versions = ch_versions.mix(IRIDA_NEXT_OUTPUT.out.versions) - //CUSTOM_DUMPSOFTWAREVERSIONS ( - // ch_versions.unique().collectFile(name: 'collated_versions.yml') - //) + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) + } From bebcbb33736f42fb1768349ac6b6e89803695bce Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 2 Apr 2024 16:29:24 -0500 Subject: [PATCH 005/119] updated expected clusters path --- tests/nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nextflow.config b/tests/nextflow.config index ff66ccb..672dc69 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -7,7 +7,7 @@ params.max_memory = "2.GB" params.max_cpus = 1 -params.ref_clusters = "$baseDir/tests/data/expected_clusters.txt" +params.ref_clusters = "$baseDir/tests/data/clusters/expected_clusters.txt" /* This is required to run in WSL/Ubuntu using singularity From 2b8de972322d37c2a13e0ce7dee34c5e0bad89fc Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 2 Apr 2024 16:56:19 -0500 Subject: [PATCH 006/119] updated test configs --- conf/test.config | 10 +++++++++- conf/test_full.config | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/conf/test.config b/conf/test.config index 0e0b591..b0061d6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,5 +20,13 @@ params { max_time = '1.h' // Input data - input = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/main/assets/samplesheet.csv' + input = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/samplesheets/samplesheet1.csv' } + + +/* This is required to run in WSL/Ubuntu using singularity +Without this, profile_dists was not successfully completing +due to 
issues with multiprocessing in the container. A similar
error is found at https://github.com/marcelm/cutadapt/issues/583
*/
singularity.runOptions = "--contain"

diff --git a/conf/test_full.config b/conf/test_full.config
index c8b5764..0981a15 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -15,5 +15,13 @@ params {
 config_profile_description = 'Full test dataset to check pipeline function'

 // Input data for full size test
- input = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/main/assets/samplesheet.csv'
+ input = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/samplesheets/samplesheet1.csv'
 }
+
+/* This is required to run in WSL/Ubuntu using singularity
+Without this, profile_dists was not successfully completing
+due to issues with multiprocessing in the container. A similar
+error is found at https://github.com/marcelm/cutadapt/issues/583
+*/
+singularity.runOptions = "--contain"
+

From e8a73736cfd89dcf1d34f0e752f2ae2dff290b39 Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Wed, 3 Apr 2024 09:10:02 -0500
Subject: [PATCH 007/119] updated nf-test.config

It's likely that the binaries for the individual tools were not being
found, as the nf-test.config file was not configured to use docker.

---
 nf-test.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nf-test.config b/nf-test.config
index 870799d..2fa82ad 100644
--- a/nf-test.config
+++ b/nf-test.config
@@ -3,6 +3,6 @@ config {
 testsDir "tests"
 workDir ".nf-test"
 configFile "tests/nextflow.config"
- profile ""
+ profile "docker"
 }

From b64bb3d1ae461cd4a426a761454aa9679e836afd Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Wed, 3 Apr 2024 09:14:22 -0500
Subject: [PATCH 008/119] updated test output path

---
 tests/pipelines/main.nf.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test
index 7370ad0..059be1f 100644
--- a/tests/pipelines/main.nf.test
+++ b/tests/pipelines/main.nf.test
@@ -19,7 +19,7 @@ nextflow_pipeline {
 // Check merged profiles
 // TODO check query profile is merged
- def actual_profile_ref = path("$launchDir/results/locidex/merged/query/merged_value/merged_profiles_value.tsv")
+ def actual_profile_ref = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv")
 def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv")
 assert actual_profile_ref.text == expected_profile_tsv.text

From f1756e9b6b824d38f2d3ff97b75227d7d7c698a9 Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Wed, 3 Apr 2024 09:19:25 -0500
Subject: [PATCH 009/119] updated test output paths

---
 tests/pipelines/main.nf.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test
index 059be1f..de784af 100644
--- a/tests/pipelines/main.nf.test
+++ b/tests/pipelines/main.nf.test
@@ -19,7 +19,7 @@ nextflow_pipeline {
 // Check merged profiles
 // TODO check query profile is merged
- def actual_profile_ref = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv")
+ def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv")
 def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv")
 assert actual_profile_ref.text == expected_profile_tsv.text
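Since three commits in a row adjust this one assertion, it may help to see how the asserted path is assembled. This is a breakdown of where each piece is defined earlier in the series, not committed code:

// $launchDir/results           <- outdir = "results" in the test's params block
// /locidex/merge/reference     <- the publishDir path for LOCIDEX_MERGE_REF, i.e.
//                                 locidex_merge_ref_directory_name = [params.outdir, "locidex", "merge", "reference"].join(File.separator)
// /merged_ref                  <- combined_dir = "merged_${input_tag}" in LOCIDEX_MERGE, with input_tag = "ref"
// /merged_profiles_ref.tsv     <- the mv in the LOCIDEX_MERGE script that renames the merged TSV
//
// The query variant is published under locidex/merge/query and, because the workflow sets
// query_tag = Channel.value("value"), lands in merged_value/merged_profiles_value.tsv.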
From 7c1e516029aedfab36c2335bde80b0e3a118e6df Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Wed, 3 Apr 2024 09:22:58 -0500
Subject: [PATCH 010/119] fixed test data line endings

---
 tests/data/profiles/expected-profile1.tsv | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/data/profiles/expected-profile1.tsv b/tests/data/profiles/expected-profile1.tsv
index 233f6e4..9b938e1 100644
--- a/tests/data/profiles/expected-profile1.tsv
+++ b/tests/data/profiles/expected-profile1.tsv
@@ -1,4 +1,4 @@
-sample_id l1 l2 l3
-sample1 1 1 1
-sample2 1 1 1
-sample3 1 1 2
+sample_id l1 l2 l3
+sample1 1 1 1
+sample2 1 1 1
+sample3 1 1 2

From bdfe6c3c9ee6d38e1c0de6074c01168b9c2642ba Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Wed, 3 Apr 2024 09:30:58 -0500
Subject: [PATCH 011/119] updated paths to expected input files

---
 tests/pipelines/main.nf.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test
index de784af..911af92 100644
--- a/tests/pipelines/main.nf.test
+++ b/tests/pipelines/main.nf.test
@@ -26,7 +26,7 @@ nextflow_pipeline {
 // Check computed pairwise distances
 def actual_distances = path("$launchDir/results/distances/results.text")
- def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.tsv")
+ def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.txt")
 assert actual_distances.text == expected_distances.text

 // Check called clusters

From 1515c374ffc5cad27d38ffaadd429cdff1958061 Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Wed, 3 Apr 2024 09:35:45 -0500
Subject: [PATCH 012/119] updated test conf file input paths

---
 conf/test.config | 2 +-
 conf/test_full.config | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index b0061d6..a8b10b4 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -20,7 +20,7 @@ params {
 max_time = '1.h'

 // Input data
- input = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/samplesheets/samplesheet1.csv'
+ input = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/samplesheets/samplesheet1.csv'
 }

diff --git a/conf/test_full.config b/conf/test_full.config
index 0981a15..9df6034 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -15,7 +15,7 @@ params {
 config_profile_description = 'Full test dataset to check pipeline function'

 // Input data for full size test
- input =
'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/samplesheets/samplesheet1.csv' + ref_clusters = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/clusters/expected_clusters.txt' } /* This is required to run in WSL/Ubuntu using singularity From ca448b5ca5e5d804580962f80a1baf4f2b169ddb Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 3 Apr 2024 11:00:46 -0500 Subject: [PATCH 014/119] updated schema, and test file line endings --- nextflow_schema.json | 77 ++++++++++++++++++++++++++-- tests/data/reports/sample1.mlst.json | 14 ++--- tests/data/reports/sample2.mlst.json | 14 ++--- tests/data/reports/sample3.mlst.json | 14 ++--- workflows/gas_nomenclature.nf | 9 ++-- 5 files changed, 98 insertions(+), 30 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 8639c86..71d1fac 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -41,7 +44,11 @@ "default": "stub", "fa_icon": "fas fa-desktop", "description": "The sequence assembler to use for sequence assembly.", - "enum": ["default", "stub", "experimental"] + "enum": [ + "default", + "stub", + "experimental" + ] }, "random_seed": { "type": "integer", @@ -152,7 +159,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -226,5 +240,58 @@ { "$ref": "#/definitions/generic_options" } - ] -} + ], + "properties": { + "pd_outfmt": { + "type": "string", + "default": "pairwise" + }, + "pd_distm": { + "type": "string", + "default": "scaled" + }, + "pd_missing_threshold": { + "type": "integer", + "default": 1 + }, + "pd_sample_quality_threshold": { + "type": "integer", + "default": 1 + }, + "pd_match_threshold": { + "type": "integer", + "default": -1 + }, + "pd_file_type": { + "type": "string", + "default": "text" + }, + "pd_mapping_file": { + "type": "string" + }, + "pd_force": { + "type": "boolean" + }, + "pd_skip": { + "type": "boolean" + }, + "pd_columns": { + "type": "string" + }, + "pd_count_missing": { + "type": "boolean", + "default": true + }, + "gm_thresholds": { + "type": "string", + "default": "10,5,0" + }, + "gm_delimiter": { + "type": "string", + "default": "\\'.\\" + }, + "ref_clusters": { + "type": "string" + } + } +} \ No newline at end of file diff --git a/tests/data/reports/sample1.mlst.json b/tests/data/reports/sample1.mlst.json index 393f0ac..01bc774 100644 --- a/tests/data/reports/sample1.mlst.json +++ b/tests/data/reports/sample1.mlst.json @@ -1,7 +1,7 @@ -{ - "sample1": { - "l1": "1", - "l2": "1", - "l3": "1" - } -} +{ + "sample1": { + "l1": "1", + "l2": "1", + "l3": "1" + } +} diff --git a/tests/data/reports/sample2.mlst.json b/tests/data/reports/sample2.mlst.json index 9af0a4c..7c0426c 100644 --- 
a/tests/data/reports/sample2.mlst.json
+++ b/tests/data/reports/sample2.mlst.json
@@ -1,7 +1,7 @@
-{
- "sample2": {
- "l1": "1",
- "l2": "1",
- "l3": "1"
- }
-}
+{
+ "sample2": {
+ "l1": "1",
+ "l2": "1",
+ "l3": "1"
+ }
+}
diff --git a/tests/data/reports/sample3.mlst.json b/tests/data/reports/sample3.mlst.json
index 88c3d0c..43ea3c7 100644
--- a/tests/data/reports/sample3.mlst.json
+++ b/tests/data/reports/sample3.mlst.json
@@ -1,7 +1,7 @@
-{
- "sample3": {
- "l1": "1",
- "l2": "1",
- "l3": "2"
- }
-}
+{
+ "sample3": {
+ "l1": "1",
+ "l2": "1",
+ "l3": "2"
+ }
+}
diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf
index f9a6572..de931e1 100644
--- a/workflows/gas_nomenclature.nf
+++ b/workflows/gas_nomenclature.nf
@@ -49,13 +49,14 @@ include { PROFILE_DISTS } from "../modules/local/profile_dists/main"
 */
-def prepareFilePath(String filep){
+def prepareFilePath(String filep, String debug_msg){
     // Returns null if a file is not valid
     def return_path = null
     if(filep){
         file_in = path(filep)
         if(file_in.exists()){
             return_path = file_in
+            log.debug debug_msg
         }
     }else{
         return_path = []
     }
@@ -74,7 +75,7 @@ workflow GAS_NOMENCLATURE {
     profiles = input.branch{
         ref: it[0].profile_type
         query: !it[0].profile_type
-        errors: true // TODO add in check on file for erroneous values, may not be needed as nf-validation is working
+        errors: true // To discuss, add in check on file for erroneous values, may not be needed as nf-validation is working
     }
@@ -92,13 +93,13 @@ workflow GAS_NOMENCLATURE {
     // PROFILE DISTS processes
-    mapping_file = prepareFilePath(params.pd_mapping_file)
+    mapping_file = prepareFilePath(params.pd_mapping_file, "Selecting ${params.pd_mapping_file} for --pd_mapping_file")
     if(mapping_file == null){
         exit 1, "${params.pd_mapping_file}: Does not exist but was passed to the pipeline. Exiting now."
     }
-    columns_file = prepareFilePath(params.pd_columns)
+    columns_file = prepareFilePath(params.pd_columns, "Selecting ${params.pd_columns} for --pd_columns")
     if(columns_file == null){
         exit 1, "${params.pd_columns}: Does not exist but was passed to the pipeline. Exiting now."
} From 6893ece76c8aead3bdf7a1a912ddd110d1e3f000 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 3 Apr 2024 11:05:10 -0500 Subject: [PATCH 015/119] updated schema types --- nextflow_schema.json | 128 +++++++++++++++++++++++++------------------ 1 file changed, 74 insertions(+), 54 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 71d1fac..f0eb807 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,6 +5,73 @@ "description": "IRIDA Next Example Pipeline", "type": "object", "definitions": { + "gas_call": { + "title": "GAS Call", + "type": "object", + "description": "", + "default": "", + "properties": { + "gm_thresholds": { + "type": "string", + "default": "10,5,0" + }, + "gm_delimiter": { + "type": "string", + "default": "\\'.\\" + }, + "ref_clusters": { + "type": "string" + } + } + }, + "profile_dists": { + "title": "Profile Dists", + "type": "object", + "description": "", + "default": "", + "properties": { + "pd_outfmt": { + "type": "string", + "default": "pairwise" + }, + "pd_distm": { + "type": "string", + "default": "scaled" + }, + "pd_missing_threshold": { + "type": "number", + "default": 1 + }, + "pd_sample_quality_threshold": { + "type": "number", + "default": 1 + }, + "pd_match_threshold": { + "type": "number", + "default": -1 + }, + "pd_file_type": { + "type": "string", + "default": "text" + }, + "pd_mapping_file": { + "type": "string" + }, + "pd_force": { + "type": "boolean" + }, + "pd_skip": { + "type": "boolean" + }, + "pd_columns": { + "type": "string" + }, + "pd_count_missing": { + "type": "boolean", + "default": true + } + } + }, "input_output_options": { "title": "Input/output options", "type": "object", @@ -228,6 +295,12 @@ } }, "allOf": [ + { + "$ref": "#/definitions/gas_call" + }, + { + "$ref": "#/definitions/profile_dists" + }, { "$ref": "#/definitions/input_output_options" }, @@ -240,58 +313,5 @@ { "$ref": "#/definitions/generic_options" } - ], - "properties": { - "pd_outfmt": { - "type": "string", - "default": "pairwise" - }, - "pd_distm": { - "type": "string", - "default": "scaled" - }, - "pd_missing_threshold": { - "type": "integer", - "default": 1 - }, - "pd_sample_quality_threshold": { - "type": "integer", - "default": 1 - }, - "pd_match_threshold": { - "type": "integer", - "default": -1 - }, - "pd_file_type": { - "type": "string", - "default": "text" - }, - "pd_mapping_file": { - "type": "string" - }, - "pd_force": { - "type": "boolean" - }, - "pd_skip": { - "type": "boolean" - }, - "pd_columns": { - "type": "string" - }, - "pd_count_missing": { - "type": "boolean", - "default": true - }, - "gm_thresholds": { - "type": "string", - "default": "10,5,0" - }, - "gm_delimiter": { - "type": "string", - "default": "\\'.\\" - }, - "ref_clusters": { - "type": "string" - } - } + ] } \ No newline at end of file From 141b040bbb0b256cb54130d82e87605b3aea3d94 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 3 Apr 2024 11:09:11 -0500 Subject: [PATCH 016/119] Ran prettier --- nextflow_schema.json | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index f0eb807..5799dcf 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -77,10 +77,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -111,11 +108,7 @@ "default": "stub", 
"fa_icon": "fas fa-desktop", "description": "The sequence assembler to use for sequence assembly.", - "enum": [ - "default", - "stub", - "experimental" - ] + "enum": ["default", "stub", "experimental"] }, "random_seed": { "type": "integer", @@ -226,14 +219,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -314,4 +300,4 @@ "$ref": "#/definitions/generic_options" } ] -} \ No newline at end of file +} From 7ce6e9fa56fc3060ac87c6827737e0f9e321ef92 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 3 Apr 2024 11:15:03 -0500 Subject: [PATCH 017/119] updated function data types --- workflows/gas_nomenclature.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index de931e1..a0befa9 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -49,7 +49,7 @@ include { PROFILE_DISTS } from "../modules/local/profile_dists/main" */ -def prepareFilePath(String filep, String debug_msg){ +def prepareFilePath(String filep, GString debug_msg){ // Rerturns null if a file is not valid def return_path = null if(filep){ From be8a6323840e720a47ea18d8d0490f893973949d Mon Sep 17 00:00:00 2001 From: Matthew Wells <76452933+mattheww95@users.noreply.github.com> Date: Fri, 5 Apr 2024 15:40:32 -0500 Subject: [PATCH 018/119] Update gas_nomenclature.nf reverted the `prepareFilePaths` function to use `file` instead of path --- workflows/gas_nomenclature.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index a0befa9..4691ae8 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -53,7 +53,7 @@ def prepareFilePath(String filep, GString debug_msg){ // Rerturns null if a file is not valid def return_path = null if(filep){ - file_in = path(filep) + file_in = file(filep) if(file_in.exists()){ return_path = file_in log.debug debug_msg From 1e554ddef598b28f24a483c77c72f0faab7085d9 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 6 May 2024 14:56:50 -0400 Subject: [PATCH 019/119] Updated samplesheet to include 'address' column --- assets/samplesheet.csv | 9 +++++---- assets/schema_input.json | 15 ++++++++------- conf/test.config | 2 +- tests/data/samplesheets/samplesheet1.csv | 10 +++++----- workflows/gas_nomenclature.nf | 16 ++++++---------- 5 files changed, 25 insertions(+), 27 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 814a27d..b0d6f9e 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,5 @@ -sample,fastq_1,fastq_2 -SAMPLE1,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R2.fastq.gz 
From 1e554ddef598b28f24a483c77c72f0faab7085d9 Mon Sep 17 00:00:00 2001
From: kylacochrane
Date: Mon, 6 May 2024 14:56:50 -0400
Subject: [PATCH 019/119] Updated samplesheet to include 'address' column

---
 assets/samplesheet.csv | 9 +++++----
 assets/schema_input.json | 15 ++++++++-------
 conf/test.config | 2 +-
 tests/data/samplesheets/samplesheet1.csv | 10 +++++-----
 workflows/gas_nomenclature.nf | 16 ++++++----------
 5 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 814a27d..b0d6f9e 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,4 +1,5 @@
-sample,fastq_1,fastq_2
-SAMPLE1,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R2.fastq.gz
-SAMPLE2,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample2_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample2_R2.fastq.gz
-SAMPLE3,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R1.fastq.gz,
+sample,mlst_alleles,address
+sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json,
+sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1
+sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1
+sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2
diff --git a/assets/schema_input.json b/assets/schema_input.json
index 028cdfd..48e9936 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -14,19 +14,20 @@
 "unique": true,
 "errorMessage": "Sample name must be provided and cannot contain spaces"
 },
- "profile_type": {
- "meta": ["profile_type"],
- "description": "Determines has already been clustered (True) or if it is new, and requiring nomenclature assignment (False)",
- "errorMessage": "Please specify if the mlst profile has already been clustered (True) or if it is new and requires nomenclature assignment (False)",
- "type": "boolean"
- },
 "mlst_alleles": {
 "type": "string",
 "format": "file-path",
 "pattern": "^\\S+\\.mlst\\.json(\\.gz)?$",
 "errorMessage": "MLST JSON file from locidex report, cannot contain spaces and must have the extension: '.mlst.json' or '.mlst.json.gz'"
+ },
+ "address": {
+ "type": "string",
+ "pattern": "^\\d+(\\.\\d+)*$",
+ "meta": ["address"],
+ "description": "The loci-based typing identifier (address) of the sample",
+ "errorMessage": "Invalid loci-based typing identifier. Please ensure that the address follows the correct format, consisting of one or more digits separated by periods. Example of a valid identifier: '1.1.1'.
Please review and correct the entry" } }, - "required": ["sample", "profile_type", "mlst_alleles"] + "required": ["sample", "mlst_alleles"] } } diff --git a/conf/test.config b/conf/test.config index 9c17e75..9ba7cf1 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,7 @@ params { max_time = '1.h' // Input data - input = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/samplesheets/samplesheet1.csv' + input = "${projectDir}/tests/data/samplesheets/samplesheet1.csv" ref_clusters = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/clusters/expected_clusters.txt' } diff --git a/tests/data/samplesheets/samplesheet1.csv b/tests/data/samplesheets/samplesheet1.csv index 8b36335..b0d6f9e 100644 --- a/tests/data/samplesheets/samplesheet1.csv +++ b/tests/data/samplesheets/samplesheet1.csv @@ -1,5 +1,5 @@ -sample,profile_type,mlst_alleles -sampleQ,false,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json -sample1,true,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json -sample2,true,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json -sample3,true,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 4691ae8..9c0a238 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -22,10 +22,10 @@ include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { GENERATE_SAMPLE_JSON } from '../modules/local/generatesamplejson/main' -include { SIMPLIFY_IRIDA_JSON } from '../modules/local/simplifyiridajson/main' -include { IRIDA_NEXT_OUTPUT } from '../modules/local/iridanextoutput/main' -include { GENERATE_SUMMARY } from '../modules/local/generatesummary/main' +include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" +include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" +include { PROFILE_DISTS } from "../modules/local/profile_dists/main" +include { GAS_CALL } from "../modules/local/gas/call/main" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -37,10 +37,6 @@ include { GENERATE_SUMMARY } from '../modules/local/generatesummary/main' // MODULE: Installed directly from nf-core/modules // include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' -include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" -include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" -include { GAS_CALL } from "../modules/local/gas/call/main" -include { PROFILE_DISTS } from "../modules/local/profile_dists/main" /* 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -73,8 +69,8 @@ workflow GAS_NOMENCLATURE { // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input"); profiles = input.branch{ - ref: it[0].profile_type - query: !it[0].profile_type + ref: it[0].address + query: !it[0].address errors: true // To discuss, add in check on file for erroneous values, may not be needed as nf-validation is working } From adfe193a6bfa0782d88c866cf14e9b590e01e0d4 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 6 May 2024 15:26:01 -0400 Subject: [PATCH 020/119] Updated locidex_merge_ref to include all samples, including new query samples --- modules/local/assemblystub/main.nf | 33 ---------------- modules/local/generatesamplejson/main.nf | 49 ------------------------ modules/local/generatesummary/main.nf | 38 ------------------ modules/local/iridanextoutput/main.nf | 31 --------------- modules/local/locidex/merge/main.nf | 1 + modules/local/simplifyiridajson/main.nf | 33 ---------------- nextflow.config | 2 +- workflows/gas_nomenclature.nf | 4 +- 8 files changed, 4 insertions(+), 187 deletions(-) delete mode 100644 modules/local/assemblystub/main.nf delete mode 100644 modules/local/generatesamplejson/main.nf delete mode 100644 modules/local/generatesummary/main.nf delete mode 100644 modules/local/iridanextoutput/main.nf delete mode 100644 modules/local/simplifyiridajson/main.nf diff --git a/modules/local/assemblystub/main.nf b/modules/local/assemblystub/main.nf deleted file mode 100644 index 00f27d2..0000000 --- a/modules/local/assemblystub/main.nf +++ /dev/null @@ -1,33 +0,0 @@ -process ASSEMBLY_STUB { - tag "$meta.id" - label 'process_single' - - container 'docker.io/python:3.9.17' - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.assembly.fa.gz"), emit: assembly - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - cat <<-EOF > ${prefix}.assembly.fa - >${meta.id}-stub-assembly - ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTTAAAAACCCCCGGGGGTTTTT - EOF - - gzip -n ${prefix}.assembly.fa - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - assemblystub : 0.1.0 - END_VERSIONS - """ -} diff --git a/modules/local/generatesamplejson/main.nf b/modules/local/generatesamplejson/main.nf deleted file mode 100644 index f3b5cd3..0000000 --- a/modules/local/generatesamplejson/main.nf +++ /dev/null @@ -1,49 +0,0 @@ -process GENERATE_SAMPLE_JSON { - tag "$meta.id" - label 'process_single' - - container 'docker.io/python:3.9.17' - - input: - tuple val(meta), path(reads), path(assembly) - - output: - tuple val(meta), path("*.json.gz"), emit: json - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def assembly_path = ["${task.assembly_directory_name}", "${assembly}"].join(File.separator) - """ - cat <<-EOF > "${meta.id}.json" - { - "files": { - "samples": { - "${meta.id}": [ - { - "path": "${assembly_path}" - } - ] - } - }, - "metadata": { - "samples": { - "${meta.id}": { - "reads": ["${reads[0]}", "${reads[1]}"] - } - } - } - } - EOF - gzip ${meta.id}.json - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - generatesamplejson : 0.1.0 - END_VERSIONS - """ -} diff --git a/modules/local/generatesummary/main.nf 
b/modules/local/generatesummary/main.nf
deleted file mode 100644
index a3d0245..0000000
--- a/modules/local/generatesummary/main.nf
+++ /dev/null
@@ -1,38 +0,0 @@
-process GENERATE_SUMMARY {
- label 'process_single'
- container 'docker.io/python:3.9.17'
-
- input:
- val summaries
-
- output:
- path("summary.txt.gz"), emit: summary
- path "versions.yml", emit: versions
-
- when:
- task.ext.when == null || task.ext.when
-
- script:
- def args = task.ext.args ?: ''
- def sorted_summaries = summaries.sort{ it[0].id }
-
- // Generate summary text:
- def summary_text = "IRIDANEXTEXAMPLE Pipeline Summary\n\nSUCCESS!\n"
-
- // TODO: Consider the possibility of code injection.
- // Should probably be moved to file processing through Python.
- for (summary in sorted_summaries) {
- summary_text += "\n${summary[0].id}:\n"
- summary_text += " reads.1: ${summary[1][0]}\n"
- summary_text += " reads.2: ${summary[1][1]}\n"
- summary_text += " assembly: ${summary[2]}\n"
- }
-
- version_text = "\"${task.process}\":\n generatesummary : 0.1.0"
-
- """
- echo "${summary_text}" > summary.txt
- gzip -n summary.txt
- echo "${version_text}" > versions.yml
- """
-}
diff --git a/modules/local/iridanextoutput/main.nf b/modules/local/iridanextoutput/main.nf
deleted file mode 100644
index 92595ee..0000000
--- a/modules/local/iridanextoutput/main.nf
+++ /dev/null
@@ -1,31 +0,0 @@
-process IRIDA_NEXT_OUTPUT {
- label 'process_single'
-
- container 'docker.io/python:3.9.17'
-
- input:
- path(samples_data)
-
- output:
- path("iridanext.output.json.gz"), emit: output_json
- path "versions.yml", emit: versions
-
- when:
- task.ext.when == null || task.ext.when
-
- script:
- def args = task.ext.args ?: ''
- def samples_data_dir = "samples_data"
- """
- irida-next-output.py \\
- $args \\
- --summary-file ${task.summary_directory_name}/summary.txt.gz \\
- --json-output iridanext.output.json.gz \\
- ${samples_data}
-
- cat <<-END_VERSIONS > versions.yml
- "${task.process}":
- iridanextoutput : 0.1.0
- END_VERSIONS
- """
-}
diff --git a/modules/local/locidex/merge/main.nf b/modules/local/locidex/merge/main.nf
index b58b154..7721625 100644
--- a/modules/local/locidex/merge/main.nf
+++ b/modules/local/locidex/merge/main.nf
@@ -22,6 +22,7 @@ process LOCIDEX_MERGE {
 locidex merge -i ${input_values.join(' ')} -o ${combined_dir}

 mv ${combined_dir}/*.tsv ${combined_dir}/merged_profiles_${input_tag}.tsv
+
 cat <<-END_VERSIONS > versions.yml
 "${task.process}":
 locidex merge: \$(echo \$(locidex search -V 2>&1) | sed 's/^.*locidex //' )
diff --git a/modules/local/simplifyiridajson/main.nf b/modules/local/simplifyiridajson/main.nf
deleted file mode 100644
index e2e7352..0000000
--- a/modules/local/simplifyiridajson/main.nf
+++ /dev/null
@@ -1,33 +0,0 @@
-process SIMPLIFY_IRIDA_JSON {
- tag "$meta.id"
- label 'process_single'
-
- container 'docker.io/python:3.9.17'
-
- input:
- tuple val(meta), path(json)
-
- output:
- tuple val(meta), path("*.simple.json.gz") , emit: simple_json
- path "versions.yml" , emit: versions
-
- when:
- task.ext.when == null || task.ext.when
-
- script:
- def args = task.ext.args ?: ''
- def prefix = task.ext.prefix ?: "${meta.id}"
- """
- simplify_irida_json.py \\
- $args \\
- --json-output ${meta.id}.simple.json \\
- ${json}
-
- gzip ${meta.id}.simple.json
-
- cat <<-END_VERSIONS > versions.yml
- "${task.process}":
- simplifyiridajson : 0.1.0
- END_VERSIONS
- """
-}
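The nextflow.config change just below switches profile_dists from scaled to hamming distances, which is what drives the expected-result updates in the following commit. As a worked check against the tracked three-locus test profiles: sample1 (1,1,1) and sample3 (1,1,2) differ at a single locus, so hamming reports a distance of 1, while scaled reports 100 * 1/3 ≈ 33.33, the value recorded in the original expected_dists.tsv.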
Profile Dists pd_outfmt = "pairwise" - pd_distm = "scaled" + pd_distm = "hamming" pd_missing_threshold = 1.0 pd_sample_quality_threshold = 1.0 pd_match_threshold = -1.0 diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 9c0a238..66e5c47 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -74,19 +74,19 @@ workflow GAS_NOMENCLATURE { errors: true // To discuss, add in check on file for erroneous values, may not be needed as nf-validation is working } - reference_values = profiles.ref.collect{ meta, profile -> profile} + reference_values = input.collect{ meta, profile -> profile} query_values = profiles.query.collect{ meta, profile -> profile } // LOCIDEX modules ref_tag = Channel.value("ref") query_tag = Channel.value("value") + merged_references = LOCIDEX_MERGE_REF(reference_values, ref_tag) ch_versions = ch_versions.mix(merged_references.versions) merged_queries = LOCIDEX_MERGE_QUERY(query_values, query_tag) ch_versions = ch_versions.mix(merged_queries.versions) - // PROFILE DISTS processes mapping_file = prepareFilePath(params.pd_mapping_file, "Selecting ${params.pd_mapping_file} for --pd_mapping_file") From 2a791000f8a40dd07fb5c51bc48f897eb5e6fafa Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 6 May 2024 16:07:04 -0400 Subject: [PATCH 021/119] Updated expected test results to accommodate all samples in reference --- nextflow_schema.json | 2 +- tests/data/called/expected_results.txt | 2 +- tests/data/distances/expected_dists.tsv | 9 +++++---- tests/data/distances/expected_pairwise_dists.txt | 7 ++++--- tests/data/profiles/expected-profile1.tsv | 1 + 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 5799dcf..d82d41f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -36,7 +36,7 @@ }, "pd_distm": { "type": "string", - "default": "scaled" + "default": "hamming" }, "pd_missing_threshold": { "type": "number", diff --git a/tests/data/called/expected_results.txt b/tests/data/called/expected_results.txt index 8a3eec9..0d30de2 100644 --- a/tests/data/called/expected_results.txt +++ b/tests/data/called/expected_results.txt @@ -2,4 +2,4 @@ id address level_1 level_2 level_3 sample1 1.1.1 1 1 1 sample2 1.1.1 1 1 1 sample3 2.2.2 2 2 2 -sampleQ 1.1.1 1 1 1 +sampleQ 1.1.3 1 1 3 diff --git a/tests/data/distances/expected_dists.tsv b/tests/data/distances/expected_dists.tsv index 00e9cec..45bdd70 100644 --- a/tests/data/distances/expected_dists.tsv +++ b/tests/data/distances/expected_dists.tsv @@ -1,4 +1,5 @@ -dists sample1 sample2 sample3 -sample1 0.0 0.0 33.333333333333336 -sample2 0.0 0.0 33.333333333333336 -sample3 33.333333333333336 33.333333333333336 0.0 +dists sampleQ sample1 sample2 sample3 +sampleQ 0 1 1 2 +sample1 1 0 0 1 +sample2 1 0 0 1 +sample3 2 1 1 0 diff --git a/tests/data/distances/expected_pairwise_dists.txt b/tests/data/distances/expected_pairwise_dists.txt index df58510..84ea004 100644 --- a/tests/data/distances/expected_pairwise_dists.txt +++ b/tests/data/distances/expected_pairwise_dists.txt @@ -1,4 +1,5 @@ query_id ref_id dist -sampleQ sample1 0.0 -sampleQ sample2 33.333333333333336 -sampleQ sample3 66.66666666666667 +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 diff --git a/tests/data/profiles/expected-profile1.tsv b/tests/data/profiles/expected-profile1.tsv index 9b938e1..6d02526 100644 --- a/tests/data/profiles/expected-profile1.tsv +++ b/tests/data/profiles/expected-profile1.tsv @@ -1,4 +1,5 @@ sample_id l1 l2
l3 +sampleQ 1 2 1 sample1 1 1 1 sample2 1 1 1 sample3 1 1 2 From 11427882bf8a1cfd4b926a2a31dc41c73cf5654f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 6 May 2024 16:17:54 -0400 Subject: [PATCH 022/119] EC error fix --- workflows/gas_nomenclature.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 66e5c47..f6b1d43 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -80,7 +80,7 @@ workflow GAS_NOMENCLATURE { // LOCIDEX modules ref_tag = Channel.value("ref") query_tag = Channel.value("value") - + merged_references = LOCIDEX_MERGE_REF(reference_values, ref_tag) ch_versions = ch_versions.mix(merged_references.versions) From b5ee7741585e70483f2850fff9f48d8c3a1bcb4d Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 6 May 2024 18:16:10 -0400 Subject: [PATCH 023/119] New module to filter out the new query addresses --- modules/local/filter/main.nf | 49 +++++++++++++++++++++++++++++++++++ workflows/gas_nomenclature.nf | 26 +++---------------- 2 files changed, 52 insertions(+), 23 deletions(-) create mode 100644 modules/local/filter/main.nf diff --git a/modules/local/filter/main.nf b/modules/local/filter/main.nf new file mode 100644 index 0000000..12ea46f --- /dev/null +++ b/modules/local/filter/main.nf @@ -0,0 +1,49 @@ +process FILTER_NEW { + tag "Filter New Query Addresses" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.22.0--h9ee0642_1' : + 'biocontainers/csvtk:0.22.0--h9ee0642_1' }" + + input: + val input_query + path addresses + val in_format + val out_format + + output: + path("new_addresses.csv"), emit: csv + path("new_addresses.json"), emit: json + + script: + + def queryID = input_query[0].id + def outputFile = "new_addresses" + + def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) + def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) + def out_extension = out_format == "tsv" ? 
'tsv' : 'csv' + + """ + # Filter the query samples only; keep only the 'id' and 'address' columns + csvtk filter2 \\ + ${addresses} \\ + --filter '\$id == \"$queryID\"' \\ + --delimiter "${delimiter}" \\ + --out-delimiter "${out_delimiter}" \\ + --out-file ${outputFile}.tmp + + csvtk cut -f 1,2 ${outputFile}.tmp > ${outputFile}.${out_extension} + rm ${outputFile}.tmp + + # Convert the CSV file to a JSON file array with 'id' as the key + csvtk csv2json ${outputFile}.${out_extension} -k id > ${outputFile}.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ +} + diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index f6b1d43..edff908 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -26,6 +26,7 @@ include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" include { PROFILE_DISTS } from "../modules/local/profile_dists/main" include { GAS_CALL } from "../modules/local/gas/call/main" +include { FILTER_NEW } from "../modules/local/filter/main" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -117,30 +118,9 @@ workflow GAS_NOMENCLATURE { ch_versions = ch_versions.mix(called_data.versions) + // Filter the new queried samples and addresses into a CSV/JSON file for the IRIDANext plug in - // A channel of tuples of ({meta}, [read[0], read[1]], assembly) - //ch_tuple_read_assembly = input.join(ASSEMBLY_STUB.out.assembly) - - //GENERATE_SAMPLE_JSON ( - // ch_tuple_read_assembly - //) - //ch_versions = ch_versions.mix(GENERATE_SAMPLE_JSON.out.versions) - - //GENERATE_SUMMARY ( - // ch_tuple_read_assembly.collect{ [it] } - //) - //ch_versions = ch_versions.mix(GENERATE_SUMMARY.out.versions) - - //SIMPLIFY_IRIDA_JSON ( - // GENERATE_SAMPLE_JSON.out.json - //) - //ch_versions = ch_versions.mix(SIMPLIFY_IRIDA_JSON.out.versions) - //ch_simplified_jsons = SIMPLIFY_IRIDA_JSON.out.simple_json.map { meta, data -> data }.collect() // Collect JSONs - - //IRIDA_NEXT_OUTPUT ( - // samples_data=ch_simplified_jsons - //) - //ch_versions = ch_versions.mix(IRIDA_NEXT_OUTPUT.out.versions) + new_addresses = FILTER_NEW(profiles.query, called_data.distances, "tsv", "csv") CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') From d6f315b05aa69780b3f5300caf8e83827d0047a0 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 7 May 2024 11:12:56 -0400 Subject: [PATCH 024/119] Added nf-iridanext plugin --- conf/iridanext.config | 15 +++++++++++++++ modules/local/filter/main.nf | 4 ---- nextflow.config | 4 ++++ 3 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 conf/iridanext.config diff --git a/conf/iridanext.config b/conf/iridanext.config new file mode 100644 index 0000000..f02eb47 --- /dev/null +++ b/conf/iridanext.config @@ -0,0 +1,15 @@ +iridanext { + enabled = true + output { + path = "${params.outdir}/iridanext.output.json.gz" + overwrite = true + metadata { + samples { + csv { + path = "**/filter/new_addresses.csv" + idcol = "id" + } + } + } + } +} diff --git a/modules/local/filter/main.nf b/modules/local/filter/main.nf index 12ea46f..befac87 100644 --- a/modules/local/filter/main.nf +++ b/modules/local/filter/main.nf @@ -14,7 +14,6 @@ process FILTER_NEW { output: path("new_addresses.csv"), emit: csv - path("new_addresses.json"), emit: json script: @@ -37,9 
+36,6 @@ process FILTER_NEW { csvtk cut -f 1,2 ${outputFile}.tmp > ${outputFile}.${out_extension} rm ${outputFile}.tmp - # Convert the CSV file to a JSON file array with 'id' as the key - csvtk csv2json ${outputFile}.${out_extension} -k id > ${outputFile}.json - cat <<-END_VERSIONS > versions.yml "${task.process}": csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) END_VERSIONS """ diff --git a/nextflow.config b/nextflow.config index a303213..9f9d8ff 100644 --- a/nextflow.config +++ b/nextflow.config @@ -177,8 +177,12 @@ singularity.registry = 'quay.io' // Nextflow plugins plugins { id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-iridanext@0.2.0' // Generation of JSON output for IRIDA Next } +// Load iridanext.config for specific options +includeConfig 'conf/iridanext.config' + // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. From 69dd10a76ea9841e782e475d2bfc9026492c5707 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 7 May 2024 11:18:12 -0400 Subject: [PATCH 025/119] EC error fix --- conf/iridanext.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/iridanext.config b/conf/iridanext.config index f02eb47..b362249 100644 --- a/conf/iridanext.config +++ b/conf/iridanext.config @@ -3,7 +3,7 @@ iridanext { output { path = "${params.outdir}/iridanext.output.json.gz" overwrite = true - metadata { + metadata { samples { csv { path = "**/filter/new_addresses.csv" From 7a9eeb9e9efba6d47b78a4e24d8533bbd1882ec9 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 8 May 2024 17:41:16 -0400 Subject: [PATCH 026/119] Adding a basic solution for identifying when sample IDs do not match mlst.json keys --- bin/input_check.py | 32 +++++++++++++++++++++++++++++++ modules/local/input_check/main.nf | 27 ++++++++++++++++++++++++++ workflows/gas_nomenclature.nf | 8 +++++++- 3 files changed, 66 insertions(+), 1 deletion(-) create mode 100755 bin/input_check.py create mode 100644 modules/local/input_check/main.nf diff --git a/bin/input_check.py b/bin/input_check.py new file mode 100755 index 0000000..0a390f7 --- /dev/null +++ b/bin/input_check.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +import json +import argparse +import csv + + +def check_inputs(json_file, sample_id, output_file): + # Define a variable to store the match status + json_data = json.load(open(json_file)) + match_status = sample_id in json_data + + # Write match status to error report CSV + if not match_status: + with open(output_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["sample", "Sample ID matches MLST.JSON key?"]) + writer.writerow([sample_id, match_status]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Check sample inputs") + parser.add_argument("--input", help="Missing mlst.json file path", required=True) + parser.add_argument( + "--sample_id", help="Missing sample meta.id path", required=True ) + parser.add_argument( + "--output", help="Requires an error report file path", required=True ) + args = parser.parse_args() + + check_inputs(args.input, args.sample_id, args.output) diff --git
a/modules/local/input_check/main.nf b/modules/local/input_check/main.nf new file mode 100644 index 0000000..1c4ac08 --- /dev/null +++ b/modules/local/input_check/main.nf @@ -0,0 +1,27 @@ +process INPUT_CHECK{ + tag "Check Sample Inputs" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(mlst) + + output: + path("*_error_report.csv"), optional: true, emit: sample_check + + script: + """ + input_check.py \\ + --input $mlst \\ + --sample_id ${meta.id} \\ + --output ${meta.id}_error_report.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index edff908..9fec652 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -22,6 +22,7 @@ include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // +include { INPUT_CHECK } from "../modules/local/input_check/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" include { PROFILE_DISTS } from "../modules/local/profile_dists/main" @@ -68,7 +69,12 @@ workflow GAS_NOMENCLATURE { // Create a new channel of metadata from a sample sheet // NB: `input` corresponds to `params.input` and associated sample sheet schema - input = Channel.fromSamplesheet("input"); + input = Channel.fromSamplesheet("input") + + // Ensure meta.id and mlst_file keys match + mlst_merged = INPUT_CHECK(input) + + // Prepare reference and query TSV files for LOCIDEX_MERGE profiles = input.branch{ ref: it[0].address query: !it[0].address From 3ec26e28256bfca7e6074c5a545bc4ff90309eaf Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 15 May 2024 16:50:54 -0400 Subject: [PATCH 027/119] Update query filtering process name and add versions --- modules/local/filter/main.nf | 3 ++- workflows/gas_nomenclature.nf | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/local/filter/main.nf b/modules/local/filter/main.nf index befac87..305af37 100644 --- a/modules/local/filter/main.nf +++ b/modules/local/filter/main.nf @@ -1,4 +1,4 @@ -process FILTER_NEW { +process FILTER_QUERY { tag "Filter New Query Addresses" label 'process_single' @@ -14,6 +14,7 @@ process FILTER_NEW { output: path("new_addresses.csv"), emit: csv + path("versions.yml"), emit: versions script: diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index edff908..64f6e48 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -26,7 +26,7 @@ include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" include { PROFILE_DISTS } from "../modules/local/profile_dists/main" include { GAS_CALL } from "../modules/local/gas/call/main" -include { FILTER_NEW } from "../modules/local/filter/main" +include { FILTER_QUERY } from "../modules/local/filter/main" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -119,8 +119,8 @@ workflow GAS_NOMENCLATURE { ch_versions = ch_versions.mix(called_data.versions) 
// Filter the new queried samples and addresses into a CSV/JSON file for the IRIDANext plug in - - new_addresses = FILTER_NEW(profiles.query, called_data.distances, "tsv", "csv") + new_addresses = FILTER_QUERY(profiles.query, called_data.distances, "tsv", "csv") + ch_versions = ch_versions.mix(new_addresses.versions) CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') From d80f3470c9cc062daba2117ef28ebc20563ce007 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 15 May 2024 17:02:50 -0400 Subject: [PATCH 028/119] Updated linting to resolve issues --- .github/workflows/linting.yml | 19 +++++++++---------- .github/workflows/linting_comment.yml | 2 +- .nf-core.yml | 1 + conf/modules.config | 26 -------------------------- 4 files changed, 11 insertions(+), 37 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 073e187..036c119 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,13 +14,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - name: Set up Python 3.11 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - name: Set up Python 3.12 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: 3.11 - cache: "pip" + python-version: 3.12 - name: Install pre-commit run: pip install pre-commit @@ -32,14 +31,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: "3.11" + python-version: "3.12" architecture: "x64" - name: Install dependencies @@ -60,7 +59,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index b706875..40acc23 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 + uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 with: workflow: linting.yml workflow_conclusion: completed diff --git a/.nf-core.yml b/.nf-core.yml index 1c764f6..e7be709 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,5 +1,6 @@ repository_type: pipeline +nf_core_version: "2.14.1" lint: files_exist: - assets/nf-core-gasnomenclature_logo_light.png diff --git a/conf/modules.config b/conf/modules.config index 86c0455..3b92819 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,30 +35,6 @@ process { ] } - withName: ASSEMBLY_STUB { - publishDir = [ - path: { ["${params.outdir}", "${task.assembly_directory_name}"].join(File.separator) }, - mode: params.publish_dir_mode, - saveAs: { filename -> 
filename.equals('versions.yml') ? null : filename } - ] - } - - withName: GENERATE_SUMMARY { - publishDir = [ - path: { ["${params.outdir}", "${task.summary_directory_name}"].join(File.separator) }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: IRIDA_NEXT_OUTPUT { - publishDir = [ - path: { "${params.outdir}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: LOCIDEX_MERGE_REF { publishDir = [ path: locidex_merge_ref_directory_name, @@ -77,7 +53,6 @@ process { ] } - withName: PROFILE_DISTS { publishDir = [ path: { ["${params.outdir}", "${task.profile_dists_directory_name}"].join(File.separator) }, @@ -107,7 +82,6 @@ process { ] } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, From 5aaaa3bf15e257dd588fe72e0979405599dc4254 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 15 May 2024 17:06:05 -0400 Subject: [PATCH 029/119] Fixed another linting issue --- .github/workflows/linting.yml | 19 ++++++++++--------- .github/workflows/linting_comment.yml | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 036c119..073e187 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,12 +14,13 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 - - name: Set up Python 3.12 - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - name: Set up Python 3.11 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 with: - python-version: 3.12 + python-version: 3.11 + cache: "pip" - name: Install pre-commit run: pip install pre-commit @@ -31,14 +32,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v2 + uses: nf-core/setup-nextflow@v1 - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 with: - python-version: "3.12" + python-version: "3.11" architecture: "x64" - name: Install dependencies @@ -59,7 +60,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 40acc23..b706875 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 + uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 with: workflow: linting.yml workflow_conclusion: completed From 272c8ac221c21c1e19d98ad33aa5f5d7b9c8edd8 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 15 May 2024 17:10:08 -0400 Subject: [PATCH 030/119] Fixing linting issues, for real this time --- 
.github/workflows/linting.yml | 19 +++++++++---------- .github/workflows/linting_comment.yml | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 073e187..1fcafe8 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,13 +14,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - name: Set up Python 3.11 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - name: Set up Python 3.12 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: 3.11 - cache: "pip" + python-version: "3.12" - name: Install pre-commit run: pip install pre-commit @@ -32,14 +31,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: "3.11" + python-version: "3.12" architecture: "x64" - name: Install dependencies @@ -60,7 +59,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index b706875..40acc23 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 + uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 with: workflow: linting.yml workflow_conclusion: completed From b9367ec6f3077897cdeb3aa6ae39cb4a48b1db67 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 15 May 2024 18:09:58 -0400 Subject: [PATCH 031/119] Add new modules/processes: INPUT_CHECK, SAMPLE_FILTER, ERROR_REPORT --- modules/local/error_report/main.nf | 29 ++++++++++++++++++ modules/local/filter_query/main.nf | 46 +++++++++++++++++++++++++++++ modules/local/input_check/main.nf | 9 ++++-- modules/local/sample_filter/main.nf | 30 +++++++++++++++++++ 4 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 modules/local/error_report/main.nf create mode 100644 modules/local/filter_query/main.nf create mode 100644 modules/local/sample_filter/main.nf diff --git a/modules/local/error_report/main.nf b/modules/local/error_report/main.nf new file mode 100644 index 0000000..f1af016 --- /dev/null +++ b/modules/local/error_report/main.nf @@ -0,0 +1,29 @@ +process ERROR_REPORT { + tag "Generates Error Report" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(mlst) + + output: + tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report + path("versions.yml"), emit: versions + + script: + """ + error_report.py \\ + --input ${mlst} \\ + --sample_id ${meta.id} \\ + --address ${meta.address} \\ + --output ${meta.id}_error_report.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/filter_query/main.nf b/modules/local/filter_query/main.nf new file mode 100644 index 0000000..305af37 --- /dev/null +++ b/modules/local/filter_query/main.nf @@ -0,0 +1,46 @@ +process FILTER_QUERY { + tag "Filter New Query Addresses" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.22.0--h9ee0642_1' : + 'biocontainers/csvtk:0.22.0--h9ee0642_1' }" + + input: + val input_query + path addresses + val in_format + val out_format + + output: + path("new_addresses.csv"), emit: csv + path("versions.yml"), emit: versions + + script: + + def queryID = input_query[0].id + def outputFile = "new_addresses" + + def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) + def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) + def out_extension = out_format == "tsv" ? 'tsv' : 'csv' + + """ + # Filter the query samples only; keep only the 'id' and 'address' columns + csvtk filter2 \\ + ${addresses} \\ + --filter '\$id == \"$queryID\"' \\ + --delimiter "${delimiter}" \\ + --out-delimiter "${out_delimiter}" \\ + --out-file ${outputFile}.tmp + + csvtk cut -f 1,2 ${outputFile}.tmp > ${outputFile}.${out_extension} + rm ${outputFile}.tmp + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ +} + diff --git a/modules/local/input_check/main.nf b/modules/local/input_check/main.nf index 1c4ac08..94d9121 100644 --- a/modules/local/input_check/main.nf +++ b/modules/local/input_check/main.nf @@ -10,18 +10,21 @@ process INPUT_CHECK{ tuple val(meta), path(mlst) output: - path("*_error_report.csv"), optional: true, emit: sample_check + tuple val(meta), path("${meta.id}_match.txt"), path(mlst), emit: match + path("versions.yml"), emit: versions script: + """ input_check.py \\ - --input $mlst \\ + --input ${mlst} \\ --sample_id ${meta.id} \\ - --output ${meta.id}_error_report.csv + --output ${meta.id}_match.txt cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') END_VERSIONS """ + } diff --git a/modules/local/sample_filter/main.nf b/modules/local/sample_filter/main.nf new file mode 100644 index 0000000..7bdd35e --- /dev/null +++ b/modules/local/sample_filter/main.nf @@ -0,0 +1,30 @@ +process SAMPLE_FILTER { + tag "Filter Samples based on Metadata Conditions" + label 'process_single' + + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3'}" + + input: + tuple val(meta), path(mlst) + + output: + tuple val(meta), path("${meta.id}.mlst.json"), optional: true, emit: out + path("versions.yml"), emit: versions + + script: + """ + filter_samples.py \\ + --id ${meta.id} \\ + --address ${meta.address} \\ + --id_match ${meta.id_match} \\ + --input ${mlst} \\ + --output ${meta.id}.mlst.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} From a7ca9b02224987507c424b98d69ae02d89f01c00 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 15 May 2024 18:10:43 -0400 Subject: [PATCH 032/119] Add corresponding python scripts; remove unused scripts --- bin/check_samplesheet.py | 4 +++- bin/error_report.py | 41 +++++++++++++++++++++++++++++++ bin/filter_samples.py | 52 ++++++++++++++++++++++++++++++++++++++++ bin/input_check.py | 16 +++++--------- 4 files changed, 102 insertions(+), 11 deletions(-) create mode 100755 bin/error_report.py create mode 100755 bin/filter_samples.py diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 4a758fe..dabf3bc 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -195,7 +195,9 @@ def check_samplesheet(file_in, file_out): # Validate the existence of the expected header columns. if not required_columns.issubset(reader.fieldnames): req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + logger.critical( + f"The sample sheet **must** contain these column headers: {req_cols}." + ) sys.exit(1) # Validate each row. checker = RowChecker() diff --git a/bin/error_report.py b/bin/error_report.py new file mode 100755 index 0000000..d5d3e82 --- /dev/null +++ b/bin/error_report.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +import json +import argparse +import csv + + +def check_inputs(json_file, sample_id, address, output_file): + # Define a variable to store the match_status (True or False) + json_data = json.load(open(json_file)) + match_status = sample_id in json_data + + # Define error message based on address (query or reference) + if address == "null": + error_message = f"Query {sample_id} removed from pipeline" + else: + error_message = f"Pipeline stopped: Reference {sample_id}'s input ID and MLST JSON file key DO NOT MATCH" + + # Write match status to error report CSV + if not match_status: + with open(output_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["sample", "JSON_key", "error_message"]) + writer.writerow([sample_id, list(json_data.keys())[0], error_message]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Check sample inputs") + parser.add_argument("--input", help="Missing mlst.json file path", required=True) + parser.add_argument( + "--sample_id", help="Missing sample meta.id path", required=True ) + parser.add_argument( + "--address", help="Missing sample meta.address path", required=True ) + parser.add_argument( + "--output", help="Requires an error report file path", required=True ) + args = parser.parse_args() + + check_inputs(args.input, args.sample_id, args.address, args.output) diff --git a/bin/filter_samples.py b/bin/filter_samples.py new file mode 100755 index 0000000..11017c2 --- /dev/null +++ b/bin/filter_samples.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python + +import argparse +import json +import os +import sys + + +def
process_input(id, address, id_match, input_file, output_file): + try: + # Load JSON data from input file + with open(input_file, "r") as json_file: + data = json.load(json_file) + + if id_match == "True": + print("ID match is True. Outputting the same tuple.", file=sys.stdout) + with open(output_file, "w") as output: + json.dump(data, output) + elif address == "null" and id_match == "False": + print("Query sample removed from analysis.", file=sys.stdout) + # Remove the input file to indicate this sample should be excluded + os.remove(input_file) + elif id_match == "False": + print( + "Pipeline stopped: Reference sample ID and MLST JSON file key DO NOT MATCH.", + file=sys.stderr, + ) + sys.exit(1) + else: + print("Unhandled case in input conditions.", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"An error occurred: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process input tuple.") + parser.add_argument("--id", type=str, required=True, help="Sample ID") + parser.add_argument("--address", type=str, required=True, help="Cluster Address") + parser.add_argument("--id_match", type=str, required=True, help="ID Match Boolean") + parser.add_argument( + "--input", type=str, required=True, help="Path to the input JSON file" + ) + parser.add_argument( + "--output", type=str, required=True, help="Path to the output file" + ) + + args = parser.parse_args() + + # Process input + process_input(args.id, args.address, args.id_match, args.input, args.output) diff --git a/bin/input_check.py b/bin/input_check.py index 0a390f7..ec8b8d9 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -2,20 +2,16 @@ import json import argparse -import csv +import sys def check_inputs(json_file, sample_id, output_file): # Define a variable to store the match status - json_data = json.load(open(json_file)) - match_status = sample_id in json_data + match_status = sample_id in json.load(open(json_file)) - # Write match status to error report CSV - if not match_status: - with open(output_file, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["sample", "Sample ID matches MLST.JSON key?"]) - writer.writerow([sample_id, match_status]) + # Write match status to file + with open(output_file, "w") as f: + f.write(str(match_status)) if __name__ == "__main__": @@ -25,7 +21,7 @@ def check_inputs(json_file, sample_id, output_file): "--sample_id", help="Missing sample meta.id path", required=True ) parser.add_argument( - "--output", help="Requires an error report file path", required=True + "--output", help="Missing match_status file path", required=True ) args = parser.parse_args() From d0829d4b3ced5774c101a4e435e943b816466e6b Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 15 May 2024 18:11:33 -0400 Subject: [PATCH 033/119] Update iridanext.config to include error_reports --- conf/iridanext.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conf/iridanext.config b/conf/iridanext.config index b362249..d0dc84c 100644 --- a/conf/iridanext.config +++ b/conf/iridanext.config @@ -3,6 +3,9 @@ iridanext { output { path = "${params.outdir}/iridanext.output.json.gz" overwrite = true + files { + samples = ["**/error/*_error_report.csv"] + } metadata { samples { csv { From c616b5b6c3581a8d6ec140c3ab10ad0d706f497f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 15 May 2024 18:11:50 -0400 Subject: [PATCH 034/119] Updated workflow --- workflows/gas_nomenclature.nf | 34 +++++++++++++++++++++++----------- 1 file 
changed, 23 insertions(+), 11 deletions(-) diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index aebc407..f5ce43c 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -23,11 +23,13 @@ include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf // include { INPUT_CHECK } from "../modules/local/input_check/main" +include { ERROR_REPORT } from "../modules/local/error_report/main" +include { SAMPLE_FILTER } from "../modules/local/sample_filter/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" include { PROFILE_DISTS } from "../modules/local/profile_dists/main" include { GAS_CALL } from "../modules/local/gas/call/main" -include { FILTER_QUERY } from "../modules/local/filter/main" +include { FILTER_QUERY } from "../modules/local/filter_query/main" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -71,16 +73,31 @@ workflow GAS_NOMENCLATURE { // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input") - // Ensure meta.id and mlst_file keys match - mlst_merged = INPUT_CHECK(input) + // Ensure meta.id and mlst_file keys match; generate error report for samples where id ≠ key + id_key = INPUT_CHECK(input) + ch_versions = ch_versions.mix(id_key.versions) + + error_report = ERROR_REPORT(input) + ch_versions = ch_versions.mix(error_report.versions) + + // Update metadata to include the id_key.match data + match = id_key.match.map { meta, file, json -> + def id_match = file.text.trim() + [meta + [id_match: id_match], json] + } + + // If samples have a disparity between meta.id and JSON key: Exclude the queried samples OR halt the pipeline with an error if sample has an associated cluster address (reference) + filtered = SAMPLE_FILTER(match) + ch_versions = ch_versions.mix(filtered.versions) + + new_input = filtered.out // Prepare reference and query TSV files for LOCIDEX_MERGE - profiles = input.branch{ + profiles = new_input.branch{ ref: it[0].address query: !it[0].address errors: true // To discuss, add in check on file for erroneous values, may not be needed as nf-validation is working } - reference_values = input.collect{ meta, profile -> profile} query_values = profiles.query.collect{ meta, profile -> profile } @@ -101,7 +118,6 @@ workflow GAS_NOMENCLATURE { exit 1, "${params.pd_mapping_file}: Does not exist but was passed to the pipeline. Exiting now." } - columns_file = prepareFilePath(params.pd_columns, "Selecting ${params.pd_columns} for --pd_mapping_file") if(columns_file == null){ exit 1, "${params.pd_columns}: Does not exist but was passed to the pipeline. Exiting now." 
@@ -114,14 +130,12 @@ workflow GAS_NOMENCLATURE { mapping_format, mapping_file, columns_file) - ch_versions = ch_versions.mix(distances.versions) // GAS CALL - clusters = Channel.fromPath(params.ref_clusters, checkIfExists: true) - called_data = GAS_CALL(clusters, distances.results) + called_data = GAS_CALL(clusters, distances.results) ch_versions = ch_versions.mix(called_data.versions) // Filter the new queried samples and addresses into a CSV/JSON file for the IRIDANext plug in @@ -134,8 +148,6 @@ workflow GAS_NOMENCLATURE { } - - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END From 0059a9b82fa1e9fac92c079c672cb2cdc3cd93d1 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 13:50:10 -0400 Subject: [PATCH 035/119] Uploading files for new integration tests --- tests/data/irida/test2_iridanext.output.json | 21 +++++++++++++++++++ tests/data/irida/test_iridanext.output.json | 17 +++++++++++++++ tests/data/reports/sample7.mlst.json | 7 +++++++ tests/data/reports/sampleF.mlst.json | 7 +++++++ tests/data/samplesheets/samplesheet_test1.csv | 6 ++++++ tests/data/samplesheets/samplesheet_test2.csv | 7 +++++++ 6 files changed, 65 insertions(+) create mode 100644 tests/data/irida/test2_iridanext.output.json create mode 100644 tests/data/irida/test_iridanext.output.json create mode 100644 tests/data/reports/sample7.mlst.json create mode 100644 tests/data/reports/sampleF.mlst.json create mode 100644 tests/data/samplesheets/samplesheet_test1.csv create mode 100644 tests/data/samplesheets/samplesheet_test2.csv diff --git a/tests/data/irida/test2_iridanext.output.json b/tests/data/irida/test2_iridanext.output.json new file mode 100644 index 0000000..fdc7af5 --- /dev/null +++ b/tests/data/irida/test2_iridanext.output.json @@ -0,0 +1,21 @@ +{ + "files": { + "global": [ + + ], + "samples": { + "sampleR": [ + { + "path": "error/sampleR_error_report.csv" + } + ] + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.1.3" + } + } + } +} diff --git a/tests/data/irida/test_iridanext.output.json b/tests/data/irida/test_iridanext.output.json new file mode 100644 index 0000000..5ba3041 --- /dev/null +++ b/tests/data/irida/test_iridanext.output.json @@ -0,0 +1,17 @@ +{ + "files": { + "global": [ + + ], + "samples": { + + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.1.3" + } + } + } +} diff --git a/tests/data/reports/sample7.mlst.json b/tests/data/reports/sample7.mlst.json new file mode 100644 index 0000000..41d6312 --- /dev/null +++ b/tests/data/reports/sample7.mlst.json @@ -0,0 +1,7 @@ +{ + "sample7": { + "l1": "1", + "l2": "1", + "l3": "1" + } +} diff --git a/tests/data/reports/sampleF.mlst.json b/tests/data/reports/sampleF.mlst.json new file mode 100644 index 0000000..8c09d39 --- /dev/null +++ b/tests/data/reports/sampleF.mlst.json @@ -0,0 +1,7 @@ +{ + "sampleF": { + "l1": "1", + "l2": "2", + "l3": "1" + } +} diff --git a/tests/data/samplesheets/samplesheet_test1.csv b/tests/data/samplesheets/samplesheet_test1.csv new file mode 100644 index 0000000..094ed0f --- /dev/null +++ b/tests/data/samplesheets/samplesheet_test1.csv @@ -0,0 +1,6 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sampleF.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 
+sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sample7.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 + diff --git a/tests/data/samplesheets/samplesheet_test2.csv b/tests/data/samplesheets/samplesheet_test2.csv new file mode 100644 index 0000000..ef952c3 --- /dev/null +++ b/tests/data/samplesheets/samplesheet_test2.csv @@ -0,0 +1,7 @@ +sample,mlst_alleles,address +sampleR,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sampleF.mlst.json, +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 + From fd90fa1b5c0c7bf9bfead17f7644621ccc2d77ec Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 14:02:49 -0400 Subject: [PATCH 036/119] Added new test and configuration files; fixed EC issues --- tests/data/irida/test2_iridanext.output.json | 4 +- tests/data/irida/test_iridanext.output.json | 8 +- tests/nextflow.config | 4 + tests/pipelines/main.nf.test | 77 +++++++++++++++++++- 4 files changed, 82 insertions(+), 11 deletions(-) diff --git a/tests/data/irida/test2_iridanext.output.json b/tests/data/irida/test2_iridanext.output.json index fdc7af5..5d0c533 100644 --- a/tests/data/irida/test2_iridanext.output.json +++ b/tests/data/irida/test2_iridanext.output.json @@ -1,8 +1,6 @@ { "files": { - "global": [ - - ], + "global": [], "samples": { "sampleR": [ { diff --git a/tests/data/irida/test_iridanext.output.json b/tests/data/irida/test_iridanext.output.json index 5ba3041..3d0bfb5 100644 --- a/tests/data/irida/test_iridanext.output.json +++ b/tests/data/irida/test_iridanext.output.json @@ -1,11 +1,7 @@ { "files": { - "global": [ - - ], - "samples": { - - } + "global": [], + "samples": {} }, "metadata": { "samples": { diff --git a/tests/nextflow.config b/tests/nextflow.config index 672dc69..2e79f3c 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -16,3 +16,7 @@ due to issues with multiprocessing in the container. 
A similar error is found at https://github.com/marcelm/cutadapt/issues/583 */ singularity.runOptions = "--contain" + +/* Remove gzipping on JSON output for testing/asserts on file contents +*/ +iridanext.output.path = "${params.outdir}/iridanext.output.json" diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 911af92..eda1da6 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -4,7 +4,7 @@ nextflow_pipeline { script "main.nf" test("Small-scale test of full pipeline"){ - tag "pipeline" + tag "pipeline_success" when{ params { @@ -23,7 +23,6 @@ nextflow_pipeline { def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv") assert actual_profile_ref.text == expected_profile_tsv.text - // Check computed pairwise distances def actual_distances = path("$launchDir/results/distances/results.text") def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.txt") @@ -33,8 +32,82 @@ nextflow_pipeline { def actual_calls = path("$launchDir/results/call/Called/results.text") def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } + + test("Integration test where input contains reference sample with mismatched MLST JSON file"){ + tag "pipeline_failure" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_test1.csv" + outdir = "results" + } + } + + then { + assert workflow.failed + assert (workflow.stdout =~ /Pipeline stopped: Reference sample ID and MLST JSON file key DO NOT MATCH/).find() + + assert path("$launchDir/results").exists() + assert path("$launchDir/results/error").exists() + + // Ensure that despite pipeline failure, error_reports are generated for all samples added to pipeline (i.e. 
sampleQ query) + def lines = [] + + lines = path("$launchDir/results/error/sample2_error_report.csv").readLines() + assert lines.contains("sample2,sample7,Pipeline stopped: Reference sample2's input ID and MLST JSON file key DO NOT MATCH") + + lines = path("$launchDir/results/error/sampleQ_error_report.csv").readLines() + assert lines.contains("sampleQ,sampleF,Query sampleQ removed from pipeline") } } + test("Integration test where input contains a single query sample with mismatched MLST JSON file"){ + tag "pipeline_success_after_query_removal" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_test2.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + assert path("$launchDir/results/error").exists() + assert path("$launchDir/results/filter").exists() + + // Check outputs + def lines = [] + + // Ensure that the error_report is generated for removed query sampleR + lines = path("$launchDir/results/error/sampleR_error_report.csv").readLines() + assert lines.contains("sampleR,sampleF,Query sampleR removed from pipeline") + + // Check query output csv + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,1.1.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test2_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_samples.sampleR.findAll { it.path == "error/sampleR_error_report.csv" }.size() == 1 + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } } From df524f689554174381044291eba4af96ad2f2a53 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 15:18:13 -0400 Subject: [PATCH 037/119] Improvements to the FILTER_QUERY module --- modules/local/filter/main.nf | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/modules/local/filter/main.nf b/modules/local/filter/main.nf index 305af37..72aacbc 100644 --- a/modules/local/filter/main.nf +++ b/modules/local/filter/main.nf @@ -13,7 +13,7 @@ process FILTER_QUERY { val out_format output: - path("new_addresses.csv"), emit: csv + path("new_addresses.*"), emit: csv path("versions.yml"), emit: versions script: @@ -31,12 +31,9 @@ process FILTER_QUERY { ${addresses} \\ --filter '\$id == \"$queryID\"' \\ --delimiter "${delimiter}" \\ - --out-delimiter "${out_delimiter}" \\ - --out-file ${outputFile}.tmp - - csvtk cut -f 1,2 ${outputFile}.tmp > ${outputFile}.${out_extension} - rm ${outputFile}.tmp - + --out-delimiter "${out_delimiter}" | \\ + csvtk cut -f id,address > ${outputFile}.${out_extension} + cat <<-END_VERSIONS > versions.yml "${task.process}": csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) From 9e50182f2496da3d3ea3c1ca5de876681551a9a5 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 15:20:06 -0400 Subject: [PATCH 038/119] EC fix --- modules/local/filter/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/filter/main.nf b/modules/local/filter/main.nf index 72aacbc..5bb4b17 100644 --- a/modules/local/filter/main.nf +++ b/modules/local/filter/main.nf @@ -33,7 +33,7 @@ process FILTER_QUERY { --delimiter "${delimiter}" \\ --out-delimiter "${out_delimiter}" | \\ csvtk cut -f id,address > ${outputFile}.${out_extension} - + 
cat <<-END_VERSIONS > versions.yml "${task.process}": csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) END_VERSIONS """ } From 9e50182f2496da3d3ea3c1ca5de876681551a9a5 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 15:20:06 -0400 Subject: [PATCH 038/119] EC fix --- modules/local/filter/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/filter/main.nf b/modules/local/filter/main.nf index 72aacbc..5bb4b17 100644 --- a/modules/local/filter/main.nf +++ b/modules/local/filter/main.nf @@ -33,7 +33,7 @@ process FILTER_QUERY { --delimiter "${delimiter}" \\ --out-delimiter "${out_delimiter}" | \\ csvtk cut -f id,address > ${outputFile}.${out_extension} - + cat <<-END_VERSIONS > versions.yml "${task.process}": csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) From e4d5031d39e276a444dc281b93f5b3a4c39e3ff8 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 15:39:17 -0400 Subject: [PATCH 039/119] Fixed discrepancy with previous merge from filter-new-addresses into filter_query --- modules/local/filter/main.nf | 43 ------------------------------ modules/local/filter_query/main.nf | 9 +++---- 2 files changed, 3 insertions(+), 49 deletions(-) delete mode 100644 modules/local/filter/main.nf diff --git a/modules/local/filter/main.nf b/modules/local/filter/main.nf deleted file mode 100644 index 5bb4b17..0000000 --- a/modules/local/filter/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process FILTER_QUERY { - tag "Filter New Query Addresses" - label 'process_single' - - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/csvtk:0.22.0--h9ee0642_1' : - 'biocontainers/csvtk:0.22.0--h9ee0642_1' }" - - input: - val input_query - path addresses - val in_format - val out_format - - output: - path("new_addresses.*"), emit: csv - path("versions.yml"), emit: versions - - script: - - def queryID = input_query[0].id - def outputFile = "new_addresses" - - def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) - def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) - def out_extension = out_format == "tsv" ? 'tsv' : 'csv' - - """ - # Filter the query samples only; keep only the 'id' and 'address' columns - csvtk filter2 \\ - ${addresses} \\ - --filter '\$id == \"$queryID\"' \\ - --delimiter "${delimiter}" \\ - --out-delimiter "${out_delimiter}" | \\ - csvtk cut -f id,address > ${outputFile}.${out_extension} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) - END_VERSIONS - """ -} - diff --git a/modules/local/filter_query/main.nf b/modules/local/filter_query/main.nf index 305af37..5bb4b17 100644 --- a/modules/local/filter_query/main.nf +++ b/modules/local/filter_query/main.nf @@ -13,7 +13,7 @@ process FILTER_QUERY { val out_format output: - path("new_addresses.csv"), emit: csv + path("new_addresses.*"), emit: csv path("versions.yml"), emit: versions script: @@ -31,11 +31,8 @@ process FILTER_QUERY { ${addresses} \\ --filter '\$id == \"$queryID\"' \\ --delimiter "${delimiter}" \\ - --out-delimiter "${out_delimiter}" \\ - --out-file ${outputFile}.tmp - - csvtk cut -f 1,2 ${outputFile}.tmp > ${outputFile}.${out_extension} - rm ${outputFile}.tmp + --out-delimiter "${out_delimiter}" | \\ + csvtk cut -f id,address > ${outputFile}.${out_extension} cat <<-END_VERSIONS > versions.yml "${task.process}": From 5bfe569ec975ee4b38dc91adec005541f9f314a4 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 16:42:47 -0400 Subject: [PATCH 040/119] Remove left-overs from pipeline template --- bin/check_samplesheet.py | 261 ----------------------------- bin/irida-next-output.py | 93 ---------- bin/simplify_irida_json.py | 77 --------- modules/local/samplesheet_check.nf | 31 ---- subworkflows/local/input_check.nf | 44 ----- 5 files changed, 506 deletions(-) delete mode 100755 bin/check_samplesheet.py delete mode 100755 bin/irida-next-output.py delete mode 100755 bin/simplify_irida_json.py delete mode 100644 modules/local/samplesheet_check.nf delete mode 100644 subworkflows/local/input_check.nf
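All five files are leftovers from the pipeline template: samplesheet checking is now handled entirely by the nf-validation plugin (Channel.fromSamplesheet validating against a JSON schema) and the IRIDA Next JSON is produced by the nf-iridanext plugin. For orientation, a rough sketch of the kind of schema nf-validation consumes is shown below; the repository's real assets/schema_input.json is not part of this series, so the field names and patterns here are assumptions inferred from the samplesheets in tests/data/samplesheets:

{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "sample": { "type": "string", "pattern": "^\\S+$" },
            "mlst_alleles": { "type": "string", "pattern": "^\\S+\\.mlst\\.json(\\.gz)?$" },
            "address": { "type": "string" }
        },
        "required": ["sample", "mlst_alleles"]
    }
}

diff --git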
a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index dabf3bc..0000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env python - - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging -import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. - row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. 
Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. - - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" - - -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) - - -def sniff_format(handle): - """ - Detect the tabular format. - - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - - Returns: - csv.Dialect: The detected tabular format. - - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect - - -def check_samplesheet(file_in, file_out): - """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. - - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - - """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. 
- if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical( - f"The sample sheet **must** contain these column headers: {req_cols}." - ) - sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/irida-next-output.py b/bin/irida-next-output.py deleted file mode 100755 index 32acd36..0000000 --- a/bin/irida-next-output.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python - -import json -from pathlib import Path -from mimetypes import guess_type -from functools import partial -import gzip -import sys -import argparse -import os -import glob - - -def get_open(f): - if "gzip" == guess_type(str(f))[1]: - return partial(gzip.open) - else: - return open - - -def main(argv=None): - parser = argparse.ArgumentParser( - description="Creates example output JSON for loading into IRIDA Next", - epilog="Example: python irida-next-output.py --json-output output.json *.json *.json.gz", - ) - parser.add_argument("files", nargs="+") - parser.add_argument( - "--summary-file", - action="store", - dest="summary_file", - type=str, - help="pipeline summary file", - default=None, - required=True, - ) - parser.add_argument( - "--json-output", - action="store", - dest="json_output", - type=str, - help="JSON output file", - default=None, - required=True, - ) - - args = parser.parse_args(argv) - - json_output_file = Path(args.json_output) - if json_output_file.exists(): - sys.stderr.write(f"Error: --json-output [{json_output_file}] exists") - return 1 - - # Not checking for the existance of the summary file - # because the path may be relative to the outdir, which we don't have here. 
- - input_files = args.files - if isinstance(input_files, str): - input_files = [input_files] - - output_dict = { - "files": { - "summary": {}, - "samples": {}, - }, - "metadata": { - "samples": {}, - }, - } - - output_metadata = { - "files": {"global": [{"path": str(args.summary_file)}], "samples": {}}, - "metadata": {"samples": {}}, - } - - for f in input_files: - _open = get_open(f) - with _open(f, "r") as fh: - sample_metadata = json.load(fh) - output_metadata["files"]["samples"] |= sample_metadata["files"]["samples"] - output_metadata["metadata"]["samples"] |= sample_metadata["metadata"]["samples"] - - data_json = json.dumps(output_metadata, sort_keys=True, indent=4) - _open = get_open(json_output_file) - with _open(json_output_file, "wt") as oh: - oh.write(data_json) - - print(f"Output written to [{json_output_file}]") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/simplify_irida_json.py b/bin/simplify_irida_json.py deleted file mode 100755 index c486625..0000000 --- a/bin/simplify_irida_json.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python - -import json -import argparse -import sys -import gzip -from mimetypes import guess_type -from functools import partial -from pathlib import Path - - -def flatten_dictionary(dictionary): - result = {} - - def flatten(item, name=""): - if type(item) is dict: - for component in item: - flatten(item[component], str(name) + str(component) + ".") - - elif type(item) is list: - for i in range(len(item)): - flatten(item[i], str(name) + str(i + 1) + ".") # i + 1 because biologists - - else: - result[str(name)[:-1]] = item # [:-1] avoids the "." appended on the previous recursion - - flatten(dictionary) - return result - - -def main(): - parser = argparse.ArgumentParser( - description="Simplifies JSON files for use with IRIDA Next", - epilog="Example: python simplify_irida_json.py --json-output output.json input.json", - ) - parser.add_argument("input") - parser.add_argument( - "--json-output", - action="store", - dest="json_output", - type=str, - help="JSON output file", - default=None, - required=True, - ) - - args = parser.parse_args() - - json_output_location = Path(args.json_output) - if json_output_location.exists(): - sys.stderr.write("Error: --json-output [{json_output_location}] exists!\n") - return 1 - - json_input_file = args.input - - # Handle GZIP and non-GZIP - encoding = guess_type(json_input_file)[1] - open_file = partial(gzip.open, mode="rt") if encoding == "gzip" else open # partial (function pointer) - - with open_file(json_input_file) as input_file: - input_json = json.load(input_file) - - # Flatten metadata: - for sample in input_json["metadata"]["samples"]: - input_json["metadata"]["samples"][sample] = flatten_dictionary(input_json["metadata"]["samples"][sample]) - - json_data = json.dumps(input_json, sort_keys=True, indent=4) - with open(json_output_location, "w") as output_file: - output_file.write(json_data) - - print("Output written to " + str(json_output_location) + "!") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf deleted file mode 100644 index 6c1c1f4..0000000 --- a/modules/local/samplesheet_check.nf +++ /dev/null @@ -1,31 +0,0 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" - label 'process_single' - - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: // This script is bundled with the pipeline, in phac-nml/gasnomenclature/bin/ - """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index 0aecf87..0000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,44 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK ( samplesheet ) - .csv - .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } - - emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { - // create meta map - def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() - - // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" - } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] - } - return fastq_meta -} From b4e8d25f38d823d74c56b768deef12a2af35d459 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 16:45:39 -0400 Subject: [PATCH 041/119] Fixed linting issue with modules.config --- conf/modules.config | 8 -------- 1 file changed, 8 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 3b92819..a898c53 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -27,14 +27,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: SAMPLESHEET_CHECK { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - withName: LOCIDEX_MERGE_REF { publishDir = [ path: locidex_merge_ref_directory_name, From cf37bfea9b84bb97ae5df045aee57ff98a3f8d91 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 17:18:51 -0400 Subject: [PATCH 042/119] Merge INPUT_CHECK and ERROR_REPORT into one module --- bin/error_report.py | 41 ------------------------------ bin/input_check.py | 40 +++++++++++++++++++---------- modules/local/error_report/main.nf | 29 --------------------- modules/local/input_check/main.nf | 12 +++++---- workflows/gas_nomenclature.nf | 4 --- 5 files changed, 34 insertions(+), 92 deletions(-) delete mode 100755 bin/error_report.py delete mode 100644 modules/local/error_report/main.nf diff --git a/bin/error_report.py b/bin/error_report.py deleted file mode 100755 index d5d3e82..0000000 --- a/bin/error_report.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python - -import json -import argparse -import csv - - -def check_inputs(json_file, sample_id, address, output_file): - # Define a variable to store the match_status (True or False) - json_data = json.load(open(json_file)) - match_status = sample_id in json.load(open(json_file)) - - # Define error message based on address (query or reference) - if address == "null": - error_message = f"Query {sample_id} removed from pipeline" - else: - error_message = f"Pipeline stopped: Reference {sample_id}'s input ID and MLST JSON file key DO NOT MATCH" - - # Write match status to error report CSV - if not match_status: - with open(output_file, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["sample", "JSON_key", "error_message"]) - writer.writerow([sample_id, list(json_data.keys())[0], error_message]) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Check sample inputs") - parser.add_argument("--input", help="Missing mlst.json file path", required=True) - parser.add_argument( - "--sample_id", help="Missing sample meta.id path", required=True - ) - parser.add_argument( - "--address", help="Missing sample meta.address path", required=True - ) - parser.add_argument( - "--output", help="Requires an error report file path", required=True - ) - args = parser.parse_args() - - check_inputs(args.input, args.sample_id, args.address, args.output) diff --git a/bin/input_check.py b/bin/input_check.py index ec8b8d9..f5e0775 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -3,26 +3,40 @@ import json import argparse import sys +import csv -def check_inputs(json_file, sample_id, output_file): - # Define a variable to store the match status - match_status = sample_id in json.load(open(json_file)) +def check_inputs(json_file, sample_id, address, output_match_file, output_error_file): + # Define a variable to store the match_status (True or False) + with open(json_file, 'r') as f: + json_data = json.load(f) + match_status = sample_id in json_data # Write match status to file - with open(output_file, "w") as f: + with open(output_match_file, "w") as f: f.write(str(match_status)) + # Define error message based on meta.address (query or reference) + if address == "null": + error_message = f"Query {sample_id} removed from pipeline" + else: + error_message = f"Pipeline stopped: Reference {sample_id}'s input ID and MLST JSON file key DO NOT MATCH" + + # Write sample ID and JSON key to error report CSV if not matched; include error message + if not match_status: + with open(output_error_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["sample", "JSON_key", 
"error_message"]) + writer.writerow([sample_id, list(json_data.keys())[0], error_message]) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Check sample inputs") - parser.add_argument("--input", help="Missing mlst.json file path", required=True) - parser.add_argument( - "--sample_id", help="Missing sample meta.id path", required=True - ) - parser.add_argument( - "--output", help="Missing match_status file path", required=True - ) + parser = argparse.ArgumentParser(description="Check sample inputs and generate an error report.") + parser.add_argument("--input", help="Path to the mlst.json file.", required=True) + parser.add_argument("--sample_id", help="Sample ID to check in the JSON file.", required=True) + parser.add_argument("--address", help="Address to use in the error message.", required=True) + parser.add_argument("--output_error", help="Path to the error report file.", required=True) + parser.add_argument("--output_match", help="Path to the match status file.", required=True) + args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.output) + check_inputs(args.input, args.sample_id, args.address, args.output_match, args.output_error) diff --git a/modules/local/error_report/main.nf b/modules/local/error_report/main.nf deleted file mode 100644 index f1af016..0000000 --- a/modules/local/error_report/main.nf +++ /dev/null @@ -1,29 +0,0 @@ -process ERROR_REPORT { - tag "Generates Error Report" - label 'process_single' - - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" - - input: - tuple val(meta), path(mlst) - - output: - tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report - path("versions.yml"), emit: versions - - script: - """ - error_report.py \\ - --input ${mlst} \\ - --sample_id ${meta.id} \\ - --address ${meta.address} \\ - --output ${meta.id}_error_report.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/input_check/main.nf b/modules/local/input_check/main.nf index 94d9121..79a2242 100644 --- a/modules/local/input_check/main.nf +++ b/modules/local/input_check/main.nf @@ -1,5 +1,5 @@ process INPUT_CHECK{ - tag "Check Sample Inputs" + tag "Check Sample Inputs and Generate Error Report" label 'process_single' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -10,8 +10,9 @@ process INPUT_CHECK{ tuple val(meta), path(mlst) output: - tuple val(meta), path("${meta.id}_match.txt"), path(mlst), emit: match - path("versions.yml"), emit: versions + tuple val(meta), path("${meta.id}_match.txt"), path(mlst), emit: match + tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report + path("versions.yml"), emit: versions script: @@ -19,12 +20,13 @@ process INPUT_CHECK{ input_check.py \\ --input ${mlst} \\ --sample_id ${meta.id} \\ - --output ${meta.id}_match.txt + --address ${meta.address} \\ + --output_error ${meta.id}_error_report.csv \\ + --output_match ${meta.id}_match.txt cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') END_VERSIONS """ - } diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index f5ce43c..4d95639 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -23,7 +23,6 @@ include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf // include { INPUT_CHECK } from "../modules/local/input_check/main" -include { ERROR_REPORT } from "../modules/local/error_report/main" include { SAMPLE_FILTER } from "../modules/local/sample_filter/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" @@ -77,9 +76,6 @@ workflow GAS_NOMENCLATURE { id_key = INPUT_CHECK(input) ch_versions = ch_versions.mix(id_key.versions) - error_report = ERROR_REPORT(input) - ch_versions = ch_versions.mix(error_report.versions) - // Update metadata to include the id_key.match data match = id_key.match.map { meta, file, json -> def id_match = file.text.trim() From 11f7be98be5b5db1d7848fa0f755bbddd994cc92 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 17:20:14 -0400 Subject: [PATCH 043/119] Fixed input_check.py --- bin/input_check.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/bin/input_check.py b/bin/input_check.py index f5e0775..a21f0c7 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -8,7 +8,7 @@ def check_inputs(json_file, sample_id, address, output_match_file, output_error_file): # Define a variable to store the match_status (True or False) - with open(json_file, 'r') as f: + with open(json_file, "r") as f: json_data = json.load(f) match_status = sample_id in json_data @@ -29,14 +29,27 @@ def check_inputs(json_file, sample_id, address, output_match_file, output_error_ writer.writerow(["sample", "JSON_key", "error_message"]) writer.writerow([sample_id, list(json_data.keys())[0], error_message]) + if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Check sample inputs and generate an error report.") + parser = argparse.ArgumentParser( + description="Check sample inputs and generate an error report." 
+ ) parser.add_argument("--input", help="Path to the mlst.json file.", required=True) - parser.add_argument("--sample_id", help="Sample ID to check in the JSON file.", required=True) - parser.add_argument("--address", help="Address to use in the error message.", required=True) - parser.add_argument("--output_error", help="Path to the error report file.", required=True) - parser.add_argument("--output_match", help="Path to the match status file.", required=True) - + parser.add_argument( + "--sample_id", help="Sample ID to check in the JSON file.", required=True + ) + parser.add_argument( + "--address", help="Address to use in the error message.", required=True + ) + parser.add_argument( + "--output_error", help="Path to the error report file.", required=True + ) + parser.add_argument( + "--output_match", help="Path to the match status file.", required=True + ) + args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_match, args.output_error) + check_inputs( + args.input, args.sample_id, args.address, args.output_match, args.output_error + ) From 2f674eccddd0bc9cf3a455a977499ff774b82be9 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 16 May 2024 17:28:38 -0400 Subject: [PATCH 044/119] Update iridanext.config --- conf/iridanext.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/iridanext.config b/conf/iridanext.config index d0dc84c..ce9ad72 100644 --- a/conf/iridanext.config +++ b/conf/iridanext.config @@ -4,7 +4,7 @@ iridanext { path = "${params.outdir}/iridanext.output.json.gz" overwrite = true files { - samples = ["**/error/*_error_report.csv"] + samples = ["**/input/*_error_report.csv"] } metadata { samples { From 5456aa7050dbb7d71387f7a70d2da302b9f5635f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 17 May 2024 11:12:14 -0400 Subject: [PATCH 045/119] Updated workflow to filter sample inputs and removed corresponding module/process --- bin/filter_samples.py | 52 ------------------- modules/local/sample_filter/main.nf | 30 ----------- .../templates/dumpsoftwareversions.py | 4 +- workflows/gas_nomenclature.nf | 17 +++--- 4 files changed, 13 insertions(+), 90 deletions(-) delete mode 100755 bin/filter_samples.py delete mode 100644 modules/local/sample_filter/main.nf diff --git a/bin/filter_samples.py b/bin/filter_samples.py deleted file mode 100755 index 11017c2..0000000 --- a/bin/filter_samples.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python - -import argparse -import json -import os -import sys - - -def process_input(id, address, id_match, input_file, output_file): - try: - # Load JSON data from input file - with open(input_file, "r") as json_file: - data = json.load(json_file) - - if id_match == "True": - print("ID match is True. 
Outputting the same tuple.", file=sys.stdout) - with open(output_file, "w") as output: - json.dump(data, output) - elif address == "null" and id_match == "False": - print("Query sample removed from analysis.", file=sys.stdout) - # Remove the input file to indicate this sample should be excluded - os.remove(input_file) - elif id_match == "False": - print( - "Pipeline stopped: Reference sample ID and MLST JSON file key DO NOT MATCH.", - file=sys.stderr, - ) - sys.exit(1) - else: - print("Unhandled case in input conditions.", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"An error occurred: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Process input tuple.") - parser.add_argument("--id", type=str, required=True, help="Sample ID") - parser.add_argument("--address", type=str, required=True, help="Cluster Address") - parser.add_argument("--id_match", type=str, required=True, help="ID Match Boolean") - parser.add_argument( - "--input", type=str, required=True, help="Path to the input JSON file" - ) - parser.add_argument( - "--output", type=str, required=True, help="Path to the output file" - ) - - args = parser.parse_args() - - # Process input - process_input(args.id, args.address, args.id_match, args.input, args.output) diff --git a/modules/local/sample_filter/main.nf b/modules/local/sample_filter/main.nf deleted file mode 100644 index 7bdd35e..0000000 --- a/modules/local/sample_filter/main.nf +++ /dev/null @@ -1,30 +0,0 @@ -process SAMPLE_FILTER { - tag "Filter Samples based on Metadata Conditions" - label 'process_single' - - container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3'}" - - input: - tuple val(meta), path(mlst) - - output: - tuple val(meta), path("${meta.id}.mlst.json"), optional: true, emit: out - path("versions.yml"), emit: versions - - script: - """ - filter_samples.py \\ - --id ${meta.id} \\ - --address ${meta.address} \\ - --id_match ${meta.id_match} \\ - --input ${mlst} \\ - --output ${meta.id}.mlst.json - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index da03340..4a99360 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -58,7 +58,9 @@ def main(): } with open("$versions") as f: - versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + versions_by_process = ( + yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + ) # aggregate versions by the module name (derived from fully-qualified process name) versions_by_module = {} diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 4d95639..e023d6a 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -23,7 +23,6 @@ include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf // include { INPUT_CHECK } from "../modules/local/input_check/main" -include { SAMPLE_FILTER } from "../modules/local/sample_filter/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" include { 
LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" include { PROFILE_DISTS } from "../modules/local/profile_dists/main" @@ -83,16 +82,20 @@ workflow GAS_NOMENCLATURE { } // If samples have a disparity between meta.id and JSON key: Exclude the queried samples OR halt the pipeline with an error if sample has an associated cluster address (reference) - filtered = SAMPLE_FILTER(match) - ch_versions = ch_versions.mix(filtered.versions) - - new_input = filtered.out + new_input = match.filter { meta, json -> + if (meta.id_match == 'True') { + return true // Keep the sample + } else if (meta.address == null && meta.id_match == 'False') { + return false // Remove the sample + } else if (meta.address != null && meta.id_match == 'False') { + // Exit with error statement + throw new RuntimeException("Pipeline exiting: sample with ID ${meta.id} does not have matching MLST JSON file.") + } + } // Prepare reference and query TSV files for LOCIDEX_MERGE profiles = new_input.branch{ - ref: it[0].address query: !it[0].address - errors: true // To discuss, add in check on file for erroneous values, may not be needed as nf-validation is working } reference_values = input.collect{ meta, profile -> profile} query_values = profiles.query.collect{ meta, profile -> profile } From 40211de629dc2846eb89617e9ba3c296e0cae24f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 17 May 2024 12:17:34 -0400 Subject: [PATCH 046/119] Update main.nf.test to align with changes from input-check branch --- tests/data/irida/test2_iridanext.output.json | 2 +- tests/pipelines/main.nf.test | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/data/irida/test2_iridanext.output.json b/tests/data/irida/test2_iridanext.output.json index 5d0c533..2882954 100644 --- a/tests/data/irida/test2_iridanext.output.json +++ b/tests/data/irida/test2_iridanext.output.json @@ -4,7 +4,7 @@ "samples": { "sampleR": [ { - "path": "error/sampleR_error_report.csv" + "path": "input/sampleR_error_report.csv" } ] } diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index eda1da6..5d3b94d 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -37,6 +37,7 @@ nextflow_pipeline { assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test_iridanext.output.json").json def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples def iridanext_metadata = iridanext_json.metadata.samples assert iridanext_metadata.sampleQ."address" == "1.1.3" @@ -55,19 +56,16 @@ nextflow_pipeline { then { assert workflow.failed - assert (workflow.stdout =~ /Pipeline stopped: Reference sample ID and MLST JSON file key DO NOT MATCH/).find() + assert (workflow.stdout =~ /Pipeline exiting: sample with ID sample2 does not have matching MLST JSON file./).find() assert path("$launchDir/results").exists() - assert path("$launchDir/results/error").exists() + assert path("$launchDir/results/input").exists() // Ensure that despite pipeline failure, error_reports are generated for all samples added to pipeline (i.e. 
sampleQ query) def lines = [] - lines = path("$launchDir/results/error/sample2_error_report.csv").readLines() + lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() assert lines.contains("sample2,sample7,Pipeline stopped: Reference sample2's input ID and MLST JSON file key DO NOT MATCH") - - lines = path("$launchDir/results/error/sampleQ_error_report.csv").readLines() - assert lines.contains("sampleQ,sampleF,Query sampleQ removed from pipeline") } } @@ -84,14 +82,14 @@ nextflow_pipeline { then { assert workflow.success assert path("$launchDir/results").exists() - assert path("$launchDir/results/error").exists() + assert path("$launchDir/results/input").exists() assert path("$launchDir/results/filter").exists() // Check outputs def lines = [] // Ensure that the error_report is generated for removed query sampleR - lines = path("$launchDir/results/error/sampleR_error_report.csv").readLines() + lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() assert lines.contains("sampleR,sampleF,Query sampleR removed from pipeline") // Check query output csv @@ -105,7 +103,7 @@ nextflow_pipeline { def iridanext_samples = iridanext_json.files.samples def iridanext_metadata = iridanext_json.metadata.samples - assert iridanext_samples.sampleR.findAll { it.path == "error/sampleR_error_report.csv" }.size() == 1 + assert iridanext_samples.sampleR.findAll { it.path == "input/sampleR_error_report.csv" }.size() == 1 assert iridanext_metadata.sampleQ."address" == "1.1.3" } } From 7feaee8f6c38a61ca6ef8d4916b808aff9f4645e Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 17 May 2024 16:55:16 -0400 Subject: [PATCH 047/119] Updated main.nf.test --- tests/data/samplesheets/samplesheet_test1.csv | 1 - tests/pipelines/main.nf.test | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/samplesheets/samplesheet_test1.csv b/tests/data/samplesheets/samplesheet_test1.csv index 094ed0f..5fcc39b 100644 --- a/tests/data/samplesheets/samplesheet_test1.csv +++ b/tests/data/samplesheets/samplesheet_test1.csv @@ -3,4 +3,3 @@ sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tes sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sample7.mlst.json,1.1.1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 - diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 5d3b94d..930cf22 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -40,6 +40,7 @@ nextflow_pipeline { def iridanext_samples = iridanext_json.files.samples def iridanext_metadata = iridanext_json.metadata.samples + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") assert iridanext_metadata.sampleQ."address" == "1.1.3" } } From 4589456a2cc7117adf54b95831a80925bd653d11 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 21 May 2024 09:38:55 -0400 Subject: [PATCH 048/119] Reverted changes to nf-core module --- .../dumpsoftwareversions/templates/dumpsoftwareversions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index 4a99360..da03340 100755 --- 
a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -58,9 +58,7 @@ def main(): } with open("$versions") as f: - versions_by_process = ( - yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module - ) + versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module # aggregate versions by the module name (derived from fully-qualified process name) versions_by_module = {} From 97482872ab05664a0235709aa7962ee8e658edb2 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 21 May 2024 10:00:34 -0400 Subject: [PATCH 049/119] Improvement to workflow --- workflows/gas_nomenclature.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index e023d6a..80518d9 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -78,12 +78,12 @@ workflow GAS_NOMENCLATURE { // Update metadata to include the id_key.match data match = id_key.match.map { meta, file, json -> def id_match = file.text.trim() - [meta + [id_match: id_match], json] + [meta + [id_match: id_match == 'True'], json] } // If samples have a disparity between meta.id and JSON key: Exclude the queried samples OR halt the pipeline with an error if sample has an associated cluster address (reference) new_input = match.filter { meta, json -> - if (meta.id_match == 'True') { + if (meta.id_match) { return true // Keep the sample } else if (meta.address == null && meta.id_match == 'False') { return false // Remove the sample From 131af6fc52b8cf2c6fb51426e4eedf0b4731ade8 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 21 May 2024 11:54:46 -0400 Subject: [PATCH 050/119] Updated workflow --- workflows/gas_nomenclature.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 80518d9..b0fb977 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -85,9 +85,9 @@ workflow GAS_NOMENCLATURE { new_input = match.filter { meta, json -> if (meta.id_match) { return true // Keep the sample - } else if (meta.address == null && meta.id_match == 'False') { + } else if (meta.address == null && !meta.id_match) { return false // Remove the sample - } else if (meta.address != null && meta.id_match == 'False') { + } else if (meta.address != null && !meta.id_match) { // Exit with error statement throw new RuntimeException("Pipeline exiting: sample with ID ${meta.id} does not have matching MLST JSON file.") } From b1c888a87bc890b8371ddcc55ebce15232123434 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 22 May 2024 13:53:35 -0400 Subject: [PATCH 051/119] Update URLs in test samplesheets --- tests/data/samplesheets/samplesheet_test1.csv | 4 ++-- tests/data/samplesheets/samplesheet_test2.csv | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/data/samplesheets/samplesheet_test1.csv b/tests/data/samplesheets/samplesheet_test1.csv index 5fcc39b..cf87b26 100644 --- a/tests/data/samplesheets/samplesheet_test1.csv +++ b/tests/data/samplesheets/samplesheet_test1.csv @@ -1,5 +1,5 @@ sample,mlst_alleles,address -sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sampleF.mlst.json, +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, 
sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 -sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sample7.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample7.mlst.json,1.1.1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet_test2.csv b/tests/data/samplesheets/samplesheet_test2.csv index ef952c3..036c317 100644 --- a/tests/data/samplesheets/samplesheet_test2.csv +++ b/tests/data/samplesheets/samplesheet_test2.csv @@ -1,5 +1,5 @@ sample,mlst_alleles,address -sampleR,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sampleF.mlst.json, +sampleR,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 From 928ca15a750b7311e1e40f6ba35bceadbdd222fa Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 15:15:34 -0400 Subject: [PATCH 052/119] Add a module to partition reference sample addresses into hierarchical levels for the gas-call process --- modules/local/cluster_file/main.nf | 36 ++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 modules/local/cluster_file/main.nf diff --git a/modules/local/cluster_file/main.nf b/modules/local/cluster_file/main.nf new file mode 100644 index 0000000..3ed44e8 --- /dev/null +++ b/modules/local/cluster_file/main.nf @@ -0,0 +1,36 @@ +process CLUSTER_FILE { + tag "Create cluster file for GAS call" + label 'process_single' + + input: + val meta + + output: + path("expected_clusters.txt"), emit: text + + exec: + def outputLines = [] + + // Determine the maximum number of levels to set the header requirements for each pipeline run + int maxLevels = meta.collect { sample -> sample.address.split("\\.").size() }.max() ?: 0 + + // Generate the header + def header = ["id", "address"] + (1..maxLevels).collect { "level_$it" } + outputLines << header.join("\t") + + // Iterate over each sample in the meta list and pull the relevant information for the text file + meta.each { sample -> + def id = sample.id + def address = sample.address + def levels = address.split("\\.") + def line = [id, address] + levels.collect { it.toString() } + (levels.size().. 
+ outputLines.each { line -> + writer.writeLine(line) + } + } +} From d2f8a0b6ddcabf2362f53ab20978be7075078e07 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 15:16:02 -0400 Subject: [PATCH 053/119] Update workflow to generated expected_clusters file --- workflows/gas_nomenclature.nf | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index b0fb977..b20be9a 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -26,6 +26,7 @@ include { INPUT_CHECK } from "../modules/local/input_c include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" include { PROFILE_DISTS } from "../modules/local/profile_dists/main" +include { CLUSTER_FILE } from "../modules/local/cluster_file/main" include { GAS_CALL } from "../modules/local/gas/call/main" include { FILTER_QUERY } from "../modules/local/filter_query/main" @@ -131,10 +132,16 @@ workflow GAS_NOMENCLATURE { columns_file) ch_versions = ch_versions.mix(distances.versions) - // GAS CALL - clusters = Channel.fromPath(params.ref_clusters, checkIfExists: true) + // Generate the expected_cluster file from the reference sample provided addresses + clusters = input.filter { meta, file -> + meta.address != null + }.collect { meta, file -> + meta } + + expected_clusters = CLUSTER_FILE(clusters) - called_data = GAS_CALL(clusters, distances.results) + // GAS CALL + called_data = GAS_CALL(expected_clusters.text, distances.results) ch_versions = ch_versions.mix(called_data.versions) // Filter the new queried samples and addresses into a CSV/JSON file for the IRIDANext plug in From 2320029dcb137c4e55d12f3a20d324c1f954bcb6 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 15:16:19 -0400 Subject: [PATCH 054/119] Update test.config --- conf/test.config | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conf/test.config b/conf/test.config index 9ba7cf1..dee168d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,8 +20,7 @@ params { max_time = '1.h' // Input data - input = "${projectDir}/tests/data/samplesheets/samplesheet1.csv" - ref_clusters = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/clusters/expected_clusters.txt' + input = "${projectDir}/assets/samplesheet.csv" } From fef02bc11df2679f80be483a99e58c267855f390 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 15:18:26 -0400 Subject: [PATCH 055/119] Update gas-call expected_results.txt following modification of sample 3 address in expected_clusters file --- tests/data/called/expected_results.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/called/expected_results.txt b/tests/data/called/expected_results.txt index 0d30de2..1e530e7 100644 --- a/tests/data/called/expected_results.txt +++ b/tests/data/called/expected_results.txt @@ -1,5 +1,5 @@ id address level_1 level_2 level_3 sample1 1.1.1 1 1 1 sample2 1.1.1 1 1 1 -sample3 2.2.2 2 2 2 +sample3 1.1.2 1 1 2 sampleQ 1.1.3 1 1 3 From 28c098a8472c247d7163b023bb57137dee60821c Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 15:47:51 -0400 Subject: [PATCH 056/119] Fixed typo --- workflows/gas_nomenclature.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index b20be9a..4dd9d5f 100644 --- 
a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -132,7 +132,7 @@ workflow GAS_NOMENCLATURE { columns_file) ch_versions = ch_versions.mix(distances.versions) - // Generate the expected_cluster file from the reference sample provided addresses + // Generate the expected_clusters.txt file from the addresses of the provided reference samples clusters = input.filter { meta, file -> meta.address != null }.collect { meta, file -> From de705f610b42c5e297775784ac850d19f037b62f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 16:01:13 -0400 Subject: [PATCH 057/119] Update parameters for profile_dists and gas_call; Add/update descriptions of various parameters for clarity --- assets/schema_input.json | 4 +-- nextflow.config | 7 +--- nextflow_schema.json | 75 +++++++++++++++++++--------------------- 3 files changed, 39 insertions(+), 47 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 48e9936..6094f92 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -17,8 +17,8 @@ "mlst_alleles": { "type": "string", "format": "file-path", - "pattern": "^\\S+\\.mlst\\.json(\\.gz)?$", - "errorMessage": "MLST JSON file from locidex report, cannot contain spaces and must have the extension: '.mlst.json' or '.mlst.json.gz'" + "pattern": "^\\S+\\.mlst(\\.subtyping)?\\.json(\\.gz)?$", + "errorMessage": "MLST JSON file from locidex report, cannot contain spaces and must have the extension: '.mlst.json', '.mlst.json.gz', '.mlst.subtyping.json', or 'mlst.subtyping.json.gz'" }, "address": { "type": "string", diff --git a/nextflow.config b/nextflow.config index 9f9d8ff..227b0bb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,9 +11,6 @@ params { // Input options input = null - project_name = 'assembly' - assembler = 'stub' - random_seed = 1 // Boilerplate options outdir = null @@ -51,19 +48,17 @@ params { pd_distm = "hamming" pd_missing_threshold = 1.0 pd_sample_quality_threshold = 1.0 - pd_match_threshold = -1.0 pd_file_type = "text" pd_mapping_file = null // default is no file pd_force = false pd_skip = false pd_columns = null - pd_count_missing = true + pd_count_missing = false // GAS Call gm_thresholds = "10,5,0" gm_delimiter = "'.'" // note the single quotes surrounding the delimiter - ref_clusters = "" } diff --git a/nextflow_schema.json b/nextflow_schema.json index d82d41f..ea67459 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,7 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/phac-nml/gasnomenclature/main/nextflow_schema.json", "title": "phac-nml/gasnomenclature pipeline parameters", - "description": "IRIDA Next Example Pipeline", + "description": "Gas Nomenclature assignment pipeline", "type": "object", "definitions": { "gas_call": { @@ -13,14 +13,15 @@ "properties": { "gm_thresholds": { "type": "string", - "default": "10,5,0" + "default": "10,5,0", + "description": "Thresholds delimited by ','. 
Values should match units from '--pd_distm' (either 'hamming' or 'scaled').", + "pattern": "^(\\d+(\\.\\d+)?,)*\\d+(\\.\\d+)?$" }, "gm_delimiter": { "type": "string", - "default": "\\'.\\" - }, - "ref_clusters": { - "type": "string" + "default": "\\'.\\", + "description": "Delimiter desired for nomenclature code.", + "pattern": "^\\S+$" } } }, @@ -32,43 +33,60 @@ "properties": { "pd_outfmt": { "type": "string", - "default": "pairwise" + "description": "The output format for distances", + "enum": ["pairwise"], + "default": "pairwise", + "hidden": true }, "pd_distm": { "type": "string", + "description": "The distance method/unit", + "enum": ["hamming", "scaled"], "default": "hamming" }, "pd_missing_threshold": { "type": "number", + "description": "The maximum proportion of missing data per locus for a locus to be kept in the analysis", + "minimum": 0, + "maximum": 1, "default": 1 }, "pd_sample_quality_threshold": { "type": "number", + "description": "The maximum proportion of missing data per sample for a sample to be kept in the analysis", + "minimum": 0, + "maximum": 1, "default": 1 }, - "pd_match_threshold": { - "type": "number", - "default": -1 - }, "pd_file_type": { "type": "string", + "description": "Output format file type", + "enum": ["text", "parquet"], "default": "text" }, "pd_mapping_file": { - "type": "string" - }, - "pd_force": { - "type": "boolean" + "type": "string", + "pattern": "^\\S+\\.json(\\.gz)?$", + "description": "A file used to map allele codes to integers for internal distance calculations", + "exists": true, + "hidden": true, + "format": "file-path" }, "pd_skip": { - "type": "boolean" + "type": "boolean", + "description": "Skip QA/QC steps" }, "pd_columns": { - "type": "string" + "type": "string", + "pattern": "^\\S+$", + "description": "Defines the loci to keep within the analysis. Formatted as a single column file with one locus name per line or list of comma-separated loci", + "exists": true, + "format": "file-path" }, "pd_count_missing": { "type": "boolean", - "default": true + "description": "Count missing alleles as different", + "default": false } } }, @@ -96,27 +114,6 @@ "description": "The output directory where the results will be saved. 
You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, - "project_name": { - "type": "string", - "default": "assembly", - "pattern": "^\\S+$", - "description": "The name of the project.", - "fa_icon": "fas fa-tag" - }, - "assembler": { - "type": "string", - "default": "stub", - "fa_icon": "fas fa-desktop", - "description": "The sequence assembler to use for sequence assembly.", - "enum": ["default", "stub", "experimental"] - }, - "random_seed": { - "type": "integer", - "default": 1, - "fa_icon": "fas fa-dice-six", - "description": "The random seed to use for sequence assembly.", - "minimum": 1 - }, "email": { "type": "string", "description": "Email address for completion summary.", From 76c69808f20ef7e7c01b1b0d9b05ec44c508b4f0 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 16:02:47 -0400 Subject: [PATCH 058/119] Remove pd_force from nextflow.config --- nextflow.config | 1 - 1 file changed, 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 227b0bb..a90df40 100644 --- a/nextflow.config +++ b/nextflow.config @@ -50,7 +50,6 @@ params { pd_sample_quality_threshold = 1.0 pd_file_type = "text" pd_mapping_file = null // default is no file - pd_force = false pd_skip = false pd_columns = null pd_count_missing = false From e9e14f057b3cd7589c1fc5c897201fd397e6ff6c Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 16:07:12 -0400 Subject: [PATCH 059/119] Removed pd_force argument from profile_dists/main.nf --- modules/local/profile_dists/main.nf | 3 --- 1 file changed, 3 deletions(-) diff --git a/modules/local/profile_dists/main.nf b/modules/local/profile_dists/main.nf index b7a0933..3d6845c 100644 --- a/modules/local/profile_dists/main.nf +++ b/modules/local/profile_dists/main.nf @@ -32,9 +32,6 @@ process PROFILE_DISTS{ if(columns){ args = args + " --columns $columns" } - if(params.pd_force){ - args = args + " --force" - } if(params.pd_skip){ args = args + " --skip" } From 43a0b41bd2ee019b268b1b71f0f0091ba1f0d5a9 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 16:56:53 -0400 Subject: [PATCH 060/119] update documentation --- README.md | 84 ++++++++++++++++++++++++++++---------------------- docs/output.md | 69 ++++++++++++++++++++++++++--------------- docs/usage.md | 24 +++++++-------- 3 files changed, 104 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index 303bd5f..0dd80be 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,58 @@ [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A523.04.3-brightgreen.svg)](https://www.nextflow.io/) -# Example Pipeline for IRIDA Next +# Genomic Address Service Nomenclature Workflow -This is an example pipeline to be used for integration with IRIDA Next. +This workflow takes provided JSON-formatted MLST allelic profiles and assigns cluster addresses to samples based on an existing cluster designations. This pipeline is designed to be integrated into IRIDA Next. However, it may be run as a stand-alone pipeline. + +A brief overview of the usage of this pipeline is given below. Detailed documentation can be found in the [docs/](docs/) directory. 
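+
+As a rough sketch of the idea (the sample and threshold values here are only illustrative), a three-level address such as `1.1.2` records one cluster assignment per distance threshold, from the loosest threshold on the left to the strictest on the right (e.g. with `--gm_thresholds "10,5,0"`):
+
+```
+address:  1 . 1 . 2
+          |   |   |
+          |   |   +-- cluster at the strictest threshold (0)
+          |   +------ cluster at the middle threshold (5)
+          +---------- cluster at the loosest threshold (10)
+```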
 # Input
 
 The input to the pipeline is a standard sample sheet (passed as `--input samplesheet.csv`) that looks like:
 
-| sample  | fastq_1         | fastq_2         |
-| ------- | --------------- | --------------- |
-| SampleA | file_1.fastq.gz | file_2.fastq.gz |
+| sample  | mlst_alleles      | address |
+| ------- | ----------------- | ------- |
+| sampleA | sampleA.mlst.json | 1.1.1   |
+| sampleQ | sampleQ.mlst.json |         |
+| sampleF | sampleF.mlst.json |         |
 
 The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/).
 
+Details on the columns can be found in the [Full samplesheet](docs/usage.md#full-samplesheet) documentation.
+
 # Parameters
 
 The main parameters are `--input` as defined above and `--outdir` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers and `-r [branch]` to specify which GitHub branch you would like to run.
 
+## Profile dists
+
+The following can be used to adjust parameters for the [profile_dists][] tool.
+
+- `--pd_outfmt`: The output format for distances. For this pipeline the only valid value is _pairwise_ (required by [gas call][]).
+- `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0 and 100.
+- `--pd_missing_threshold`: The maximum proportion of missing data per locus for a locus to be kept in the analysis. Values from 0 to 1.
+- `--pd_sample_quality_threshold`: The maximum proportion of missing data per sample for a sample to be kept in the analysis. Values from 0 to 1.
+- `--pd_file_type`: Output format file type. One of _text_ or _parquet_.
+- `--pd_mapping_file`: A file used to map allele codes to integers for internal distance calculations. This is the same file as produced from the _profile dists_ step (the [allele_map.json](docs/output.md#profile-dists) file). Normally, this is unneeded unless you wish to override the automated process of mapping alleles to integers.
+- `--pd_skip`: Skip QA/QC steps. Can be used as a flag, `--pd_skip`, or passing a boolean, `--pd_skip true` or `--pd_skip false`.
+- `--pd_columns`: Defines the loci to keep within the analysis (default when unset is to keep all loci). Formatted as a single column file with one locus name per line. For example:
+  - **Single column format**
+    ```
+    loci1
+    loci2
+    loci3
+    ```
+- `--pd_count_missing`: Count missing alleles as different. Can be used as a flag, `--pd_count_missing`, or passing a boolean, `--pd_count_missing true` or `--pd_count_missing false`. If true, will consider missing allele calls for the same locus between samples as a difference, increasing the distance counts.
+
+## GAS CALL
+
+The following can be used to adjust parameters for the [gas call][] tool.
+
+- `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_).
+- `--gm_delimiter`: Delimiter desired for nomenclature code.
+
+## Other
+
 Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](nextflow_schema.json).
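+
+For example, a run that overrides several of these parameters at once might look roughly like the following (an illustrative sketch only; the samplesheet path is a placeholder and the parameter values are not recommendations):
+
+```bash
+nextflow run phac-nml/gasnomenclature \
+  -profile singularity \
+  --input samplesheet.csv \
+  --outdir results \
+  --pd_distm hamming \
+  --pd_count_missing \
+  --gm_thresholds "10,5,0" \
+  --gm_delimiter "."
+```
+
+Whichever combination is chosen, the values given to `--gm_thresholds` must be expressed in the units selected by `--pd_distm`.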
# Running @@ -39,51 +74,26 @@ An example of the what the contents of the IRIDA Next JSON file looks like for t ``` { "files": { - "global": [ - { - "path": "summary/summary.txt.gz" - } - ], + "global": [], "samples": { - "SAMPLE1": [ - { - "path": "assembly/SAMPLE1.assembly.fa.gz" - } - ], - "SAMPLE2": [ + "sampleF": [ { - "path": "assembly/SAMPLE2.assembly.fa.gz" + "path": "input/sampleF_error_report.csv" } ], - "SAMPLE3": [ - { - "path": "assembly/SAMPLE3.assembly.fa.gz" - } - ] } }, "metadata": { "samples": { - "SAMPLE1": { - "reads.1": "sample1_R1.fastq.gz", - "reads.2": "sample1_R2.fastq.gz" - }, - "SAMPLE2": { - "reads.1": "sample2_R1.fastq.gz", - "reads.2": "sample2_R2.fastq.gz" - }, - "SAMPLE3": { - "reads.1": "sample1_R1.fastq.gz", - "reads.2": "null" + "sampleQ": { + "address": "1.1.3", } } } } ``` -Within the `files` section of this JSON file, all of the output paths are relative to the `outdir`. Therefore, `"path": "assembly/SAMPLE1.assembly.fa.gz"` refers to a file located within `outdir/assembly/SAMPLE1.assembly.fa.gz`. - -There is also a pipeline execution summary output file provided (specified in the above JSON as `"global": [{"path":"summary/summary.txt.gz"}]`). However, there is no formatting specification for this file. +Within the `files` section of this JSON file, all of the output paths are relative to the `outdir`. Therefore, `"path": "input/sampleF_error_report.csv"` refers to a file located within `outdir/input/sampleF_error_report.csv`. This file is generated only if a sample fails the input check during samplesheet assessment. ## Test profile @@ -95,7 +105,7 @@ nextflow run phac-nml/gasnomenclature -profile docker,test -r main -latest --out # Legal -Copyright 2023 Government of Canada +Copyright 2024 Government of Canada Licensed under the MIT License (the "License"); you may not use this work except in compliance with the License. You may obtain a copy of the diff --git a/docs/output.md b/docs/output.md index 817c382..4ad48e8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -6,11 +6,13 @@ This document describes the output produced by the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. -- assembly: very small mock assembly files for each sample -- generate: intermediate files used in generating the IRIDA Next JSON output -- pipeline_info: information about the pipeline's execution -- simplify: simplified intermediate files used in generating the IRIDA Next JSON output -- summary: summary report about the pipeline's execution and results +- call: The cluster addresses from the [genomic_address_service](https://github.com/phac-nml/genomic_address_service). +- cluster: The cluster file required by GAS_call. +- distances: Distances between genomes from [profile_dists](https://github.com/phac-nml/profile_dists). +- filter: The cluster addresses from only the query samples. +- input: An error report that is only generated when sample IDs and MLST JSON files do not match. +- locidex: The merged MLST JSON files for reference and query samples. +- pipeline_info: Information about the pipeline's execution The IRIDA Next-compliant JSON output file will be named `iridanext.output.json.gz` and will be written to the top-level of the results directory. This file is compressed using GZIP and conforms to the [IRIDA Next JSON output specifications](https://github.com/phac-nml/pipeline-standards#42-irida-next-json). 
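+
+For orientation, the top level of a results directory therefore ends up looking roughly like this (an illustrative sketch; the exact contents depend on the run, and `input/` only appears when an error report is produced):
+
+```
+results/
+├── call/
+├── cluster/
+├── distances/
+├── filter/
+├── input/
+├── locidex/
+├── pipeline_info/
+└── iridanext.output.json.gz
+```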
@@ -18,60 +20,79 @@ The IRIDA Next-compliant JSON output file will be named `iridanext.output.json.g The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [Assembly stub](#assembly-stub) - Performs a stub assembly by generating a mock assembly -- [Generate sample JSON](#generate-sample-json) - Generates a JSON file for each sample -- [Generate summary](#generate-summary) - Generates a summary text file describing the samples and assemblies -- [Simplify IRIDA JSON](#simplify-irida-json) - Simplifies the sample JSONs by limiting nesting depth +- [Input check](#input-check) - Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key. +- [Locidex merge](#locidex-merge) - Merges MLST profile JSON files into a single profiles file for reference and query samples. +- [Profile dists](#profile-dists) - Computes pairwise distances between genomes using MLST allele differences. +- [Cluster file](#cluster-file) - Generates the expected_clusters.txt file from reference sample addresses for use in GAS_call. +- [GAS call](#gas-call) - Generates hierarchical cluster addresses. +- [Filter query](#filter-query) - Filters and generates a csv file containing only the cluster addresses for query samples. - [IRIDA Next Output](#irida-next-output) - Generates a JSON output file that is compliant with IRIDA Next - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### Assembly stub +### Input Check
Output files -- `assembly/` - - Mock assembly files: `ID.assembly.fa.gz` +- `input/` + - `sampleID_error_report.csv`
-### Generate sample JSON +### Locidex merge
Output files -- `generate/` - - JSON files: `ID.json.gz` +- `locidex/merge/` + - reference samples: `reference/merged_ref/merged_profiles_ref.tsv` + - query samples: `query/merged_value/merged_profiles_value.tsv`
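A merged profile is a plain TSV with one row per sample and one column per locus. Using the test data from this patch series (and assuming `--outdir results`), the merged query profile would look like:

```bash
cat results/locidex/merge/query/merged_value/merged_profiles_value.tsv
# sample_id  l1  l2  l3
# sampleQ    1   2   1
```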
-### Generate summary +### Profile Dists
Output files -- `summary/` - - Text summary describing samples and assemblies: `summary.txt.gz` +- `distances/` + - Mapping allele identifiers to integers: `allele_map.json` + - The query MLST profiles: `query_profile.text` + - The reference MLST profiles: `ref_profile.text` + - The computed distances based on MLST allele differences: `results.text` + - Information on the profile_dists run: `run.json`
-### Simplify IRIDA JSON +### Cluster File
Output files -- `simplify/` - - Simplified JSON files: `ID.simple.json.gz` +- `cluster/` + - `expected_clusters.txt`
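The cluster file is a tab-separated table with the full address plus one `level_N` column per threshold level. With the reference samples from this patch series' test data (and assuming `--outdir results`) it would contain:

```bash
cat results/cluster/expected_clusters.txt
# id       address  level_1  level_2  level_3
# sample1  1.1.1    1        1        1
# sample2  1.1.1    1        1        1
# sample3  1.1.2    1        1        2
```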
-### IRIDA Next Output +### GAS call
Output files -- `/` - - IRIDA Next-compliant JSON output: `iridanext.output.json.gz` +- `call/` + - The computed cluster addresses: `clusters.text` + - Information on the GAS call run: `run.json` + - Thresholds used to compute cluster addresses: `thresholds.json` + +
+ +### Filter Query + +
+Output files + +- `filter/` + - `new_addresses.csv`
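This is a small two-column CSV holding only the newly assigned query addresses. With the test data from this patch series (and assuming `--outdir results`) it would contain:

```bash
cat results/filter/new_addresses.csv
# id,address
# sampleQ,1.1.3
```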
diff --git a/docs/usage.md b/docs/usage.md index 4fbd758..4f4abd4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,7 +2,7 @@ ## Introduction -This pipeline is an example that illustrates running a nf-core-compliant pipeline on IRIDA Next. +This workflow takes provided JSON-formatted MLST allelic profiles and assigns cluster addresses to samples based on existing cluster designations. This pipeline is designed to be integrated into IRIDA Next. However, it may be run as a stand-alone pipeline. ## Samplesheet input @@ -14,22 +14,22 @@ You will need to create a samplesheet with information about the samples you wou ### Full samplesheet -The input samplesheet must contain three columns: `ID`, `fastq_1`, `fastq_2`. The IDs within a samplesheet should be unique. All other columns will be ignored. +The input samplesheet must contain three columns: `sample`, `mlst_alleles`, `address`. The sample names within a samplesheet should be unique. All other columns will be ignored. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. +A final samplesheet file consisting of mlst_alleles and addresses may look something like the one below: ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -SAMPLE1,sample1_R1.fastq.gz,sample1_R2.fastq.gz -SAMPLE2,sample2_R1.fastq.gz,sample2_R2.fastq.gz -SAMPLE3,sample1_R1.fastq.gz, +sample,mlst_alleles,address +sampleA,sampleA.mlst.json.gz,1.1.1 +sampleQ,sampleQ.mlst.json.gz,2.2.2 +sampleF,sampleF.mlst.json, ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. Samples should be unique within a samplesheet. | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. Samples should be unique within a samplesheet. | +| `mlst_alleles` | Full path to an MLST JSON file describing the loci/alleles for the sample against some MLST scheme. A way to generate this file is via [locidex](https://github.com/phac-nml/locidex). File can optionally be gzipped and must have the extension ".mlst.json", ".mlst.subtyping.json" (or with an additional ".gz" if gzipped). | +| `address` | Hierarchical clustering address. If left empty for a sample, the pipeline will perform de novo clustering based on the provided cluster designations and thresholds. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
From f01e12ccbcf99c096923964bc3c67fadc5059728 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 17:05:48 -0400 Subject: [PATCH 061/119] Update gm_delimiter parameter --- README.md | 2 +- nextflow_schema.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0dd80be..d8344e5 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ The following can be used to adjust parameters for the [profile_dists][] tool. The following can be used to adjust parameters for the [gas call][] tool. - `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_). -- `--gm_delimiter`: Delimiter desired for nomenclature code. +- `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`. ## Other diff --git a/nextflow_schema.json b/nextflow_schema.json index ea67459..5859003 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -21,7 +21,7 @@ "type": "string", "default": "\\'.\\", "description": "Delimiter desired for nomenclature code.", - "pattern": "^\\S+$" + "pattern": "^[A-Fa-f0-9\\._-]+$" } } }, From dabbdecf5c07902dee8fbb4e2873571b977b86fe Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 17:23:53 -0400 Subject: [PATCH 062/119] Fixed defaults for gm_delimiter --- nextflow.config | 2 +- nextflow_schema.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index a90df40..4cd0af2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -57,7 +57,7 @@ params { // GAS Call gm_thresholds = "10,5,0" - gm_delimiter = "'.'" // note the single quotes surrounding the delimiter + gm_delimiter = "." } diff --git a/nextflow_schema.json b/nextflow_schema.json index 5859003..3cbdf77 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,7 +19,7 @@ }, "gm_delimiter": { "type": "string", - "default": "\\'.\\", + "default": ".", "description": "Delimiter desired for nomenclature code.", "pattern": "^[A-Fa-f0-9\\._-]+$" } From 4a199de83a1ef86e804b27ff7555bf70f51d5a3a Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 27 May 2024 17:26:36 -0400 Subject: [PATCH 063/119] Fix whitespace --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 4cd0af2..a33af43 100644 --- a/nextflow.config +++ b/nextflow.config @@ -57,7 +57,7 @@ params { // GAS Call gm_thresholds = "10,5,0" - gm_delimiter = "." + gm_delimiter = "." 
} From 1019492794e08d76d947c987c1ed7ca2fc0a330e Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 28 May 2024 17:06:03 -0400 Subject: [PATCH 064/119] Update CLUSTER_FILE to split on gm_delimiter --- modules/local/cluster_file/main.nf | 6 +++--- nextflow.config | 2 +- nextflow_schema.json | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/local/cluster_file/main.nf b/modules/local/cluster_file/main.nf index 3ed44e8..3c8c04f 100644 --- a/modules/local/cluster_file/main.nf +++ b/modules/local/cluster_file/main.nf @@ -10,9 +10,9 @@ process CLUSTER_FILE { exec: def outputLines = [] - + print "${params.gm_delimiter}" // Determine the maximum number of levels to set the header requirements for each pipeline run - int maxLevels = meta.collect { sample -> sample.address.split("\\.").size() }.max() ?: 0 + int maxLevels = meta.collect { sample -> sample.address.split("\\$params.gm_delimiter").size() }.max() ?: 0 // Generate the header def header = ["id", "address"] + (1..maxLevels).collect { "level_$it" } @@ -22,7 +22,7 @@ process CLUSTER_FILE { meta.each { sample -> def id = sample.id def address = sample.address - def levels = address.split("\\.") + def levels = address.split("\\$params.gm_delimiter") def line = [id, address] + levels.collect { it.toString() } + (levels.size().. Date: Wed, 29 May 2024 16:58:08 -0400 Subject: [PATCH 065/119] Raise exception if sample levels are different --- modules/local/cluster_file/main.nf | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/modules/local/cluster_file/main.nf b/modules/local/cluster_file/main.nf index 3c8c04f..3e0213c 100644 --- a/modules/local/cluster_file/main.nf +++ b/modules/local/cluster_file/main.nf @@ -10,11 +10,19 @@ process CLUSTER_FILE { exec: def outputLines = [] - print "${params.gm_delimiter}" + // Determine the maximum number of levels to set the header requirements for each pipeline run int maxLevels = meta.collect { sample -> sample.address.split("\\$params.gm_delimiter").size() }.max() ?: 0 - // Generate the header + // Verify each sample is consistent with $maxLevels + meta.each { sample -> + int level = sample.address.split("\\$params.gm_delimiter").size() + if (level != maxLevels) { + throw new Exception("Inconsistent levels found: expected $maxLevels but found $level in the following input sample: ${sample.id}") + } + } + + // Generate the header for the expected_clusters.txt file def header = ["id", "address"] + (1..maxLevels).collect { "level_$it" } outputLines << header.join("\t") @@ -27,7 +35,7 @@ process CLUSTER_FILE { outputLines << line.join("\t") } - // Write the text file + // Write the text file, iterating over each sample task.workDir.resolve("expected_clusters.txt").withWriter { writer -> outputLines.each { line -> writer.writeLine(line) From 2758cd0c6a50c2c04ceda68e043cf427a7019e29 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 30 May 2024 13:01:29 -0400 Subject: [PATCH 066/119] Updated cluster_file process to use 'error' instead of 'throw Exception' --- modules/local/cluster_file/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/cluster_file/main.nf b/modules/local/cluster_file/main.nf index 3e0213c..5b049c6 100644 --- a/modules/local/cluster_file/main.nf +++ b/modules/local/cluster_file/main.nf @@ -18,7 +18,7 @@ process CLUSTER_FILE { meta.each { sample -> int level = sample.address.split("\\$params.gm_delimiter").size() if (level != maxLevels) { - throw new Exception("Inconsistent levels found: expected 
$maxLevels but found $level in the following input sample: ${sample.id}") + error ("Inconsistent levels found: expected $maxLevels levels but found $level levels in ${sample.id}") } } From bd0cd1a1a14dcf6b3408f6fe8a8b3dff418e6021 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 30 May 2024 13:03:10 -0400 Subject: [PATCH 067/119] Added a cluster_file process test to evaluate samples with different levels --- tests/modules/cluster_file/main.nf.test | 29 ++++++ tests/pipelines/gasnomenclature.nf.test | 112 ++++++++++++++++++++++++ 2 files changed, 141 insertions(+) create mode 100644 tests/modules/cluster_file/main.nf.test create mode 100644 tests/pipelines/gasnomenclature.nf.test diff --git a/tests/modules/cluster_file/main.nf.test b/tests/modules/cluster_file/main.nf.test new file mode 100644 index 0000000..d46615d --- /dev/null +++ b/tests/modules/cluster_file/main.nf.test @@ -0,0 +1,29 @@ +nextflow_process { + name "Test Process CLUSTER_FILE" + script "modules/local/cluster_file/main.nf" + process "CLUSTER_FILE" + + test("Test when sample levels are different") { + + when { + process { + """ + input[0] = Channel.of( + [['id':'sample1', 'address':'1.1.1'], + ['id':'sample2', 'address':'1.1.1'], + ['id':'sample3', 'address':'1.2']] + ) + """ + } + + params { + outdir = "clusterfile_test_out" + } + } + + then { + assert process.failed + assert (process.stdout =~ /Inconsistent levels found: expected 3 levels but found 2 levels in sample3/).find() + } + } +} diff --git a/tests/pipelines/gasnomenclature.nf.test b/tests/pipelines/gasnomenclature.nf.test new file mode 100644 index 0000000..930cf22 --- /dev/null +++ b/tests/pipelines/gasnomenclature.nf.test @@ -0,0 +1,112 @@ +nextflow_pipeline { + + name "Integration test of nomenclature assignment pipeline" + script "main.nf" + + test("Small-scale test of full pipeline"){ + tag "pipeline_success" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check merged profiles + // TODO check query profile is merged + def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv") + assert actual_profile_ref.text == expected_profile_tsv.text + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } + + test("Integration test where input contains reference sample with mismatched MLST JSON file"){ + tag "pipeline_failure" + + when { + 
params { + input = "$baseDir/tests/data/samplesheets/samplesheet_test1.csv" + outdir = "results" + } + } + + then { + assert workflow.failed + assert (workflow.stdout =~ /Pipeline exiting: sample with ID sample2 does not have matching MLST JSON file./).find() + + assert path("$launchDir/results").exists() + assert path("$launchDir/results/input").exists() + + // Ensure that despite pipeline failure, error_reports are generated for all samples added to pipeline (i.e. sampleQ query) + def lines = [] + + lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() + assert lines.contains("sample2,sample7,Pipeline stopped: Reference sample2's input ID and MLST JSON file key DO NOT MATCH") + } + } + + test("Integration test where input contains a single query sample with mismatched MLST JSON file"){ + tag "pipeline_success_after_query_removal" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_test2.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + assert path("$launchDir/results/input").exists() + assert path("$launchDir/results/filter").exists() + + // Check outputs + def lines = [] + + // Ensure that the error_report is generated for removed query sampleR + lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() + assert lines.contains("sampleR,sampleF,Query sampleR removed from pipeline") + + // Check query output csv + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,1.1.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test2_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_samples.sampleR.findAll { it.path == "input/sampleR_error_report.csv" }.size() == 1 + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } + +} From 6b7243a5735a625cf6c437021231d3792ce88578 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 4 Jun 2024 09:55:01 -0400 Subject: [PATCH 068/119] Add clustering method --- README.md | 1 + modules/local/gas/call/main.nf | 1 + nextflow.config | 1 + nextflow_schema.json | 6 ++++++ 4 files changed, 9 insertions(+) diff --git a/README.md b/README.md index d8344e5..4e24c8e 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ The following can be used to adjust parameters for the [profile_dists][] tool. The following can be used to adjust parameters for the [gas call][] tool. - `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_). +- `--gm_method`: The linkage method to use for clustering. Value should be one of _single_, _average_, or _complete_. - `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`. 
## Other diff --git a/modules/local/gas/call/main.nf b/modules/local/gas/call/main.nf index 33db7a7..56f7d92 100644 --- a/modules/local/gas/call/main.nf +++ b/modules/local/gas/call/main.nf @@ -26,6 +26,7 @@ process GAS_CALL{ gas call --dists $distances \\ --rclusters $reference_clusters \\ --outdir ${prefix} \\ + --method ${params.gm_method} \\ --threshold ${params.gm_thresholds} \\ --delimeter ${params.gm_delimiter} diff --git a/nextflow.config b/nextflow.config index a33af43..31a7a17 100644 --- a/nextflow.config +++ b/nextflow.config @@ -57,6 +57,7 @@ params { // GAS Call gm_thresholds = "10,5,0" + gm_method = "average" gm_delimiter = "." } diff --git a/nextflow_schema.json b/nextflow_schema.json index 3cbdf77..99b1c29 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -17,6 +17,12 @@ "description": "Thresholds delimited by ','. Values should match units from '--pd_distm' (either 'hamming' or 'scaled').", "pattern": "^(\\d+(\\.\\d+)?,)*\\d+(\\.\\d+)?$" }, + "gm_method": { + "type": "string", + "default": "average", + "description": "Clustering linkage method.", + "enum": ["single", "average", "complete"] + }, "gm_delimiter": { "type": "string", "default": ".", From e32f5d26feb65d0f5500fd630a648e67196a1670 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 4 Jun 2024 12:50:45 -0400 Subject: [PATCH 069/119] Comment removed from gas-call main.nf --- modules/local/gas/call/main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/local/gas/call/main.nf b/modules/local/gas/call/main.nf index 56f7d92..3216c9e 100644 --- a/modules/local/gas/call/main.nf +++ b/modules/local/gas/call/main.nf @@ -20,7 +20,6 @@ process GAS_CALL{ path "versions.yml", emit: versions script: - // Need to add more args for gas call below prefix = "Called" """ gas call --dists $distances \\ From 20f55224e9f6d7af2eb6913288159ef981a2b263 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 6 Jun 2024 13:55:33 -0400 Subject: [PATCH 070/119] Removed padding in cluster_file process --- modules/local/cluster_file/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/cluster_file/main.nf b/modules/local/cluster_file/main.nf index 5b049c6..dfb18b3 100644 --- a/modules/local/cluster_file/main.nf +++ b/modules/local/cluster_file/main.nf @@ -31,7 +31,7 @@ process CLUSTER_FILE { def id = sample.id def address = sample.address def levels = address.split("\\$params.gm_delimiter") - def line = [id, address] + levels.collect { it.toString() } + (levels.size().. Date: Thu, 6 Jun 2024 13:58:28 -0400 Subject: [PATCH 071/119] nextflow.config comment clean up --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 5d1cecc..454fc13 100644 --- a/nextflow.config +++ b/nextflow.config @@ -62,7 +62,7 @@ params { // GAS Call gm_thresholds = "10,5,0" - gm_delimiter = "." // note the single quotes surrounding the delimiter + gm_delimiter = "." 
ref_clusters = "" } From be3ba2671b0e9358d3031c917560bfe7539bc5dc Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 6 Jun 2024 16:45:58 -0400 Subject: [PATCH 072/119] Revise 'input_check' to 'input_assure'; enforce JSON key alteration to match the sample ID if a mismatch is detected --- bin/input_check.py | 24 +++++++++++++++---- .../{input_check => input_assure}/main.nf | 2 +- workflows/gas_nomenclature.nf | 19 ++++----------- 3 files changed, 24 insertions(+), 21 deletions(-) rename modules/local/{input_check => input_assure}/main.nf (97%) diff --git a/bin/input_check.py b/bin/input_check.py index a21f0c7..bd7ac70 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -4,11 +4,18 @@ import argparse import sys import csv +import gzip +def open_file(file_path, mode): + # Open a file based on the file extension + if file_path.endswith('.gz'): + return gzip.open(file_path, mode) + else: + return open(file_path, mode) def check_inputs(json_file, sample_id, address, output_match_file, output_error_file): # Define a variable to store the match_status (True or False) - with open(json_file, "r") as f: + with open(json_file, "rt") as f: json_data = json.load(f) match_status = sample_id in json_data @@ -16,23 +23,30 @@ def check_inputs(json_file, sample_id, address, output_match_file, output_error_ with open(output_match_file, "w") as f: f.write(str(match_status)) + # Define the original key in the JSON data + original_key = list(json_data.keys())[0] + # Define error message based on meta.address (query or reference) if address == "null": - error_message = f"Query {sample_id} removed from pipeline" + error_message = f"Query {sample_id} ID did not match the JSON key in {json_file} - User must manually check input files to ensure correctness." else: - error_message = f"Pipeline stopped: Reference {sample_id}'s input ID and MLST JSON file key DO NOT MATCH" + error_message = f"Reference {sample_id}'s sample ID and JSON key in {json_file} DO NOT MATCH: the '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}'." # Write sample ID and JSON key to error report CSV if not matched; include error message if not match_status: with open(output_error_file, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["sample", "JSON_key", "error_message"]) - writer.writerow([sample_id, list(json_data.keys())[0], error_message]) + writer.writerow([sample_id, original_key, error_message]) + # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + with open(json_file, "wt") as f: + json.dump(json_data, f, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Check sample inputs and generate an error report." + description="Check sample inputs, force change if ID ≠ KEY, and generate an error report." 
) parser.add_argument("--input", help="Path to the mlst.json file.", required=True) parser.add_argument( diff --git a/modules/local/input_check/main.nf b/modules/local/input_assure/main.nf similarity index 97% rename from modules/local/input_check/main.nf rename to modules/local/input_assure/main.nf index 79a2242..90260a2 100644 --- a/modules/local/input_check/main.nf +++ b/modules/local/input_assure/main.nf @@ -1,4 +1,4 @@ -process INPUT_CHECK{ +process INPUT_ASSURE { tag "Check Sample Inputs and Generate Error Report" label 'process_single' diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index b0fb977..dfa029d 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -22,7 +22,7 @@ include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from "../modules/local/input_check/main" +include { INPUT_ASSURE } from "../modules/local/input_assure/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" include { PROFILE_DISTS } from "../modules/local/profile_dists/main" @@ -72,7 +72,7 @@ workflow GAS_NOMENCLATURE { input = Channel.fromSamplesheet("input") // Ensure meta.id and mlst_file keys match; generate error report for samples where id ≠ key - id_key = INPUT_CHECK(input) + id_key = INPUT_ASSURE(input) ch_versions = ch_versions.mix(id_key.versions) // Update metadata to include the id_key.match data @@ -80,21 +80,10 @@ workflow GAS_NOMENCLATURE { def id_match = file.text.trim() [meta + [id_match: id_match == 'True'], json] } - - // If samples have a disparity between meta.id and JSON key: Exclude the queried samples OR halt the pipeline with an error if sample has an associated cluster address (reference) - new_input = match.filter { meta, json -> - if (meta.id_match) { - return true // Keep the sample - } else if (meta.address == null && !meta.id_match) { - return false // Remove the sample - } else if (meta.address != null && !meta.id_match) { - // Exit with error statement - throw new RuntimeException("Pipeline exiting: sample with ID ${meta.id} does not have matching MLST JSON file.") - } - } + match.view() // Prepare reference and query TSV files for LOCIDEX_MERGE - profiles = new_input.branch{ + profiles = match.branch { query: !it[0].address } reference_values = input.collect{ meta, profile -> profile} From 95e40f6979ab60cbe6f8531bb72996c45cf1c56f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 08:47:18 -0400 Subject: [PATCH 073/119] Remove id_match from meta --- bin/input_check.py | 11 ++--------- modules/local/input_assure/main.nf | 5 ++--- workflows/gas_nomenclature.nf | 9 +-------- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/bin/input_check.py b/bin/input_check.py index bd7ac70..19c099d 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -13,16 +13,12 @@ def open_file(file_path, mode): else: return open(file_path, mode) -def check_inputs(json_file, sample_id, address, output_match_file, output_error_file): +def check_inputs(json_file, sample_id, address, output_error_file): # Define a variable to store the match_status (True or False) with open(json_file, "rt") as f: json_data = json.load(f) match_status = sample_id in json_data - # Write match status to file - with open(output_match_file, "w") as f: - f.write(str(match_status)) - # Define the 
original key in the JSON data original_key = list(json_data.keys())[0] @@ -58,12 +54,9 @@ def check_inputs(json_file, sample_id, address, output_match_file, output_error_ parser.add_argument( "--output_error", help="Path to the error report file.", required=True ) - parser.add_argument( - "--output_match", help="Path to the match status file.", required=True - ) args = parser.parse_args() check_inputs( - args.input, args.sample_id, args.address, args.output_match, args.output_error + args.input, args.sample_id, args.address, args.output_error ) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index 90260a2..1b22242 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path("${meta.id}_match.txt"), path(mlst), emit: match + tuple val(meta), path(mlst), emit: match tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -21,8 +21,7 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv \\ - --output_match ${meta.id}_match.txt + --output_error ${meta.id}_error_report.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index dfa029d..d527777 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -75,15 +75,8 @@ workflow GAS_NOMENCLATURE { id_key = INPUT_ASSURE(input) ch_versions = ch_versions.mix(id_key.versions) - // Update metadata to include the id_key.match data - match = id_key.match.map { meta, file, json -> - def id_match = file.text.trim() - [meta + [id_match: id_match == 'True'], json] - } - match.view() - // Prepare reference and query TSV files for LOCIDEX_MERGE - profiles = match.branch { + profiles = id_key.match.branch { query: !it[0].address } reference_values = input.collect{ meta, profile -> profile} From 9e20417c028621a6c18fd8011aef9bfbb1885956 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 08:49:36 -0400 Subject: [PATCH 074/119] Fix linting --- modules/local/input_assure/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index 1b22242..c01c319 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -21,7 +21,7 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv + --output_error ${meta.id}_error_report.csv cat <<-END_VERSIONS > versions.yml "${task.process}": From deb43495bf65e0e599d4f645f6ca389f6698fcdc Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 09:26:53 -0400 Subject: [PATCH 075/119] Updated error_message from input_assure --- bin/input_check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/input_check.py b/bin/input_check.py index 19c099d..5c8365d 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -24,9 +24,9 @@ def check_inputs(json_file, sample_id, address, output_error_file): # Define error message based on meta.address (query or reference) if address == "null": - error_message = f"Query {sample_id} ID did not match the JSON key in {json_file} - User must manually check input files to ensure correctness." 
+ error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." else: - error_message = f"Reference {sample_id}'s sample ID and JSON key in {json_file} DO NOT MATCH: the '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}'." + error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." # Write sample ID and JSON key to error report CSV if not matched; include error message if not match_status: From 358175e565b80330d1f4ce1686dd111b35fe8fad Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 13:41:13 -0400 Subject: [PATCH 076/119] fix: correctly escape delimiter in CLUSTER_FILE process --- modules/local/cluster_file/main.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/local/cluster_file/main.nf b/modules/local/cluster_file/main.nf index dfb18b3..dfb8004 100644 --- a/modules/local/cluster_file/main.nf +++ b/modules/local/cluster_file/main.nf @@ -10,13 +10,14 @@ process CLUSTER_FILE { exec: def outputLines = [] + def delimiter = java.util.regex.Pattern.quote(params.gm_delimiter) // Determine the maximum number of levels to set the header requirements for each pipeline run - int maxLevels = meta.collect { sample -> sample.address.split("\\$params.gm_delimiter").size() }.max() ?: 0 + int maxLevels = meta.collect { sample -> sample.address.split(delimiter).size() }.max() ?: 0 // Verify each sample is consistent with $maxLevels meta.each { sample -> - int level = sample.address.split("\\$params.gm_delimiter").size() + int level = sample.address.split(delimiter).size() if (level != maxLevels) { error ("Inconsistent levels found: expected $maxLevels levels but found $level levels in ${sample.id}") } @@ -30,7 +31,7 @@ process CLUSTER_FILE { meta.each { sample -> def id = sample.id def address = sample.address - def levels = address.split("\\$params.gm_delimiter") + def levels = address.split(delimiter) def line = [id, address] + levels.collect { it.toString() } outputLines << line.join("\t") } From b4cbd03c95189a1dd585fd81938b0138587dc1ea Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 14:03:26 -0400 Subject: [PATCH 077/119] Add check to main.nf.test to assert query profile is merged --- tests/data/profiles/expected-profile2.tsv | 2 + tests/pipelines/gasnomenclature.nf.test | 112 ---------------------- tests/pipelines/main.nf.test | 6 +- 3 files changed, 7 insertions(+), 113 deletions(-) create mode 100644 tests/data/profiles/expected-profile2.tsv delete mode 100644 tests/pipelines/gasnomenclature.nf.test diff --git a/tests/data/profiles/expected-profile2.tsv b/tests/data/profiles/expected-profile2.tsv new file mode 100644 index 0000000..44020cb --- /dev/null +++ b/tests/data/profiles/expected-profile2.tsv @@ -0,0 +1,2 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 diff --git a/tests/pipelines/gasnomenclature.nf.test b/tests/pipelines/gasnomenclature.nf.test deleted file mode 100644 index 930cf22..0000000 --- a/tests/pipelines/gasnomenclature.nf.test +++ /dev/null @@ -1,112 +0,0 @@ -nextflow_pipeline { - - name "Integration test of nomenclature assignment pipeline" - script "main.nf" - - test("Small-scale test of full pipeline"){ - tag "pipeline_success" - - when{ - params { - 
input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" - outdir = "results" - } - } - - then { - assert workflow.success - assert path("$launchDir/results").exists() - - // Check merged profiles - // TODO check query profile is merged - def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") - def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv") - assert actual_profile_ref.text == expected_profile_tsv.text - - // Check computed pairwise distances - def actual_distances = path("$launchDir/results/distances/results.text") - def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.txt") - assert actual_distances.text == expected_distances.text - - // Check called clusters - def actual_calls = path("$launchDir/results/call/Called/results.text") - def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") - assert actual_calls.text == expected_calls.text - - // Check IRIDA Next JSON output - assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test_iridanext.output.json").json - - def iridanext_json = path("$launchDir/results/iridanext.output.json").json - def iridanext_samples = iridanext_json.files.samples - def iridanext_metadata = iridanext_json.metadata.samples - - assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") - assert iridanext_metadata.sampleQ."address" == "1.1.3" - } - } - - test("Integration test where input contains reference sample with mismatched MLST JSON file"){ - tag "pipeline_failure" - - when { - params { - input = "$baseDir/tests/data/samplesheets/samplesheet_test1.csv" - outdir = "results" - } - } - - then { - assert workflow.failed - assert (workflow.stdout =~ /Pipeline exiting: sample with ID sample2 does not have matching MLST JSON file./).find() - - assert path("$launchDir/results").exists() - assert path("$launchDir/results/input").exists() - - // Ensure that despite pipeline failure, error_reports are generated for all samples added to pipeline (i.e. 
sampleQ query) - def lines = [] - - lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() - assert lines.contains("sample2,sample7,Pipeline stopped: Reference sample2's input ID and MLST JSON file key DO NOT MATCH") - } - } - - test("Integration test where input contains a single query sample with mismatched MLST JSON file"){ - tag "pipeline_success_after_query_removal" - - when{ - params { - input = "$baseDir/tests/data/samplesheets/samplesheet_test2.csv" - outdir = "results" - } - } - - then { - assert workflow.success - assert path("$launchDir/results").exists() - assert path("$launchDir/results/input").exists() - assert path("$launchDir/results/filter").exists() - - // Check outputs - def lines = [] - - // Ensure that the error_report is generated for removed query sampleR - lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() - assert lines.contains("sampleR,sampleF,Query sampleR removed from pipeline") - - // Check query output csv - lines = path("$launchDir/results/filter/new_addresses.csv").readLines() - assert lines.contains("sampleQ,1.1.3") - - // Check IRIDA Next JSON output - assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test2_iridanext.output.json").json - - def iridanext_json = path("$launchDir/results/iridanext.output.json").json - def iridanext_samples = iridanext_json.files.samples - def iridanext_metadata = iridanext_json.metadata.samples - - assert iridanext_samples.sampleR.findAll { it.path == "input/sampleR_error_report.csv" }.size() == 1 - assert iridanext_metadata.sampleQ."address" == "1.1.3" - } - } - -} diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 930cf22..e35a9c3 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -18,11 +18,15 @@ nextflow_pipeline { assert path("$launchDir/results").exists() // Check merged profiles - // TODO check query profile is merged def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv") assert actual_profile_ref.text == expected_profile_tsv.text + // Check query profiles + def actual_profile_query = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv") + def expected_profile_query_tsv = path("$baseDir/tests/data/profiles/expected-profile2.tsv") + assert actual_profile_query.text == expected_profile_query_tsv.text + // Check computed pairwise distances def actual_distances = path("$launchDir/results/distances/results.text") def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.txt") From 6691dea10c7140403b859fb97ea9a2c4cc79149b Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 14:26:42 -0400 Subject: [PATCH 078/119] Added success case to cluster_file process test --- tests/data/clusters/expected_clusters.txt | 2 +- tests/modules/cluster_file/main.nf.test | 31 ++++++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/data/clusters/expected_clusters.txt b/tests/data/clusters/expected_clusters.txt index c4adfe5..362ea84 100644 --- a/tests/data/clusters/expected_clusters.txt +++ b/tests/data/clusters/expected_clusters.txt @@ -1,4 +1,4 @@ id address level_1 level_2 level_3 sample1 1.1.1 1 1 1 sample2 1.1.1 1 1 1 -sample3 2.2.2 2 2 2 +sample3 1.1.2 1 1 2 diff --git a/tests/modules/cluster_file/main.nf.test 
b/tests/modules/cluster_file/main.nf.test index d46615d..43fd71c 100644 --- a/tests/modules/cluster_file/main.nf.test +++ b/tests/modules/cluster_file/main.nf.test @@ -3,6 +3,35 @@ nextflow_process { script "modules/local/cluster_file/main.nf" process "CLUSTER_FILE" + test("Test when sample levels are equal") { + + when { + process { + """ + input[0] = Channel.of( + [['id':'sample1', 'address':'1.1.1'], + ['id':'sample2', 'address':'1.1.1'], + ['id':'sample3', 'address':'1.1.2']] + ) + """ + } + + params { + outdir = "cluster_results" + } + } + + then { + assert process.success + assert path("$launchDir/cluster_results").exists() + + // Check expected_clusters + def actual_clusters = path("$launchDir/cluster_results/cluster/expected_clusters.txt") + def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters.txt") + assert actual_clusters.text == expected_clusters.text + } + } + test("Test when sample levels are different") { when { @@ -17,7 +46,7 @@ nextflow_process { } params { - outdir = "clusterfile_test_out" + outdir = "cluster_results" } } From a1b36852f018497b8fd50144b7031a1ae7f97876 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 15:01:10 -0400 Subject: [PATCH 079/119] Update README and USAGE documents --- README.md | 3 +-- docs/usage.md | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4e24c8e..4aa8b43 100644 --- a/README.md +++ b/README.md @@ -28,14 +28,13 @@ The main parameters are `--input` as defined above and `--output` for specifying The following can be used to adjust parameters for the [profile_dists][] tool. -- `--pd_outfmt`: The output format for distances. For this pipeline the only valid value is _pairwise_ (required by [gas call][]). - `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0 and 1. - `--pd_missing_threshold`: The maximum proportion of missing data per locus for a locus to be kept in the analysis. Values from 0 to 1. - `--pd_sample_quality_threshold`: The maximum proportion of missing data per sample for a sample to be kept in the analysis. Values from 0 to 1. - `--pd_file_type`: Output format file type. One of _text_ or _parquet_. - `--pd_mapping_file`: A file used to map allele codes to integers for internal distance calculations. This is the same file as produced from the _profile dists_ step (the [allele_map.json](docs/output.md#profile-dists) file). Normally, this is unneeded unless you wish to override the automated process of mapping alleles to integers. - `--pd_skip`: Skip QA/QC steps. Can be used as a flag, `--pd_skip`, or passing a boolean, `--pd_skip true` or `--pd_skip false`. -- `--pd_columns`: Defines the loci to keep within the analysis (default when unset is to keep all loci). Formatted as a single column file with one locus name per line. For example: +- `--pd_columns`: Path to a file that defines the loci to keep within the analysis (default when unset is to keep all loci). Formatted as a single column file with one locus name per line. 
For example: - **Single column format** ``` loci1 loci2 loci3 ``` diff --git a/docs/usage.md b/docs/usage.md index 4f4abd4..2433443 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -29,7 +29,7 @@ sampleF,sampleF.mlst.json, | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Custom sample name. Samples should be unique within a samplesheet. | | `mlst_alleles` | Full path to an MLST JSON file describing the loci/alleles for the sample against some MLST scheme. A way to generate this file is via [locidex](https://github.com/phac-nml/locidex). File can optionally be gzipped and must have the extension ".mlst.json", ".mlst.subtyping.json" (or with an additional ".gz" if gzipped). | -| `address` | Hierarchical clustering address. If left empty for a sample, the pipeline will perform de novo clustering based on the provided cluster designations and thresholds. | +| `address` | Hierarchical clustering address. If left empty for a sample, the pipeline will assign a cluster address. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. From 04ee4fd8044719d6d76a2c786266337af7d829b3 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 15:21:02 -0400 Subject: [PATCH 080/119] Remove the --pd_outfmt parameter --- modules/local/profile_dists/main.nf | 6 +++--- nextflow.config | 1 - nextflow_schema.json | 7 ------- workflows/gas_nomenclature.nf | 3 --- 4 files changed, 3 insertions(+), 14 deletions(-) diff --git a/modules/local/profile_dists/main.nf b/modules/local/profile_dists/main.nf index 3d6845c..f43d63b 100644 --- a/modules/local/profile_dists/main.nf +++ b/modules/local/profile_dists/main.nf @@ -9,7 +9,6 @@ process PROFILE_DISTS{ input: path query path ref - val mapping_format path mapping_file path columns @@ -39,9 +38,10 @@ process PROFILE_DISTS{ args = args + " --count_missing" } // --match_threshold $params.profile_dists.match_thresh \\ - prefix = "distances_${mapping_format}" + prefix = "distances_pairwise" """ - profile_dists --query $query --ref $ref $args --outfmt $mapping_format \\ + profile_dists --query $query --ref $ref $args \\ + --outfmt pairwise \\ --distm $params.pd_distm \\ --file_type $params.pd_file_type \\ --missing_thresh $params.pd_missing_threshold \\ diff --git a/nextflow.config b/nextflow.config index 31a7a17..9734b66 100644 --- a/nextflow.config +++ b/nextflow.config @@ -44,7 +44,6 @@ params { validate_params = true // Profile Dists - pd_outfmt = "pairwise" pd_distm = "hamming" pd_missing_threshold = 1.0 pd_sample_quality_threshold = 1.0 diff --git a/nextflow_schema.json b/nextflow_schema.json index 99b1c29..b2b8a89 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -37,13 +37,6 @@ "description": "", "default": "", "properties": { - "pd_outfmt": { - "type": "string", - "description": "The output format for distances", - "enum": ["pairwise"], - "default": "pairwise", - "hidden": true - }, "pd_distm": { "type": "string", "description": "The distance method/unit", diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 4dd9d5f..813de21 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -123,11 +123,8 @@ workflow GAS_NOMENCLATURE { exit 1, "${params.pd_columns}: Does not exist but 
was passed to the pipeline. Exiting now." } - mapping_format = Channel.value(params.pd_outfmt) - distances = PROFILE_DISTS(merged_queries.combined_profiles, merged_references.combined_profiles, - mapping_format, mapping_file, columns_file) ch_versions = ch_versions.mix(distances.versions) From 07fe2c66dee1631279630a3bdda33281f30b4b2b Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:52:24 -0400 Subject: [PATCH 081/119] Update python script name to match process: input_assure.py --- bin/{input_check.py => input_assure.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/{input_check.py => input_assure.py} (100%) diff --git a/bin/input_check.py b/bin/input_assure.py similarity index 100% rename from bin/input_check.py rename to bin/input_assure.py From 23c1397efd2c31f8af2588a846809af16a81a0fc Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:54:35 -0400 Subject: [PATCH 082/119] Add 'fair = true' to input_assure process in modules.config for reproducibility --- conf/modules.config | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a898c53..00855c7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -13,8 +13,6 @@ process { // Publish directory names - assembly_directory_name = "assembly" - summary_directory_name = "summary" profile_dists_directory_name = "distances" gas_call_directory_name = "call" @@ -27,6 +25,10 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: INPUT_ASSURE { + fair = true + } + withName: LOCIDEX_MERGE_REF { publishDir = [ path: locidex_merge_ref_directory_name, From c7252cfeb77a4f32fce0f0811e711961ce07b19a Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:56:49 -0400 Subject: [PATCH 083/119] Update input_assure.py to include additional check for multiple keys --- bin/input_assure.py | 52 +++++++++++++++++++----------- modules/local/input_assure/main.nf | 6 ++-- workflows/gas_nomenclature.nf | 10 +++--- 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5c8365d..2705eae 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -2,7 +2,6 @@ import json import argparse -import sys import csv import gzip @@ -14,31 +13,48 @@ def open_file(file_path, mode): return open(file_path, mode) def check_inputs(json_file, sample_id, address, output_error_file): - # Define a variable to store the match_status (True or False) - with open(json_file, "rt") as f: + with open_file(json_file, "rt") as f: json_data = json.load(f) - match_status = sample_id in json_data - # Define the original key in the JSON data - original_key = list(json_data.keys())[0] + # Define a variable to store the match_status (True or False) + match_status = sample_id in json_data + + keys = list (json_data.keys()) + original_key = keys[0] + # Initialize the error message + error_message = None + + # Check for multiple keys in the JSON file and define error message + if len(keys) > 1: + # Check if sample_id matches any key + if not match_status: + error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." + # Retain only the specified sample ID + json_data = {sample_id: json_data.pop(original_key)} + else: + error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. 
The MLST JSON file has been modified to retain only the '{sample_id}' entry" + # Remove all keys except the one matching sample_id + json_data = {sample_id: json_data[sample_id]} elif not match_status: # Define error message based on meta.address (query or reference) if address == "null": error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." else: error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." + # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + + # Write file containing relevant error messages + if error_message: with open(output_error_file, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["sample", "JSON_key", "error_message"]) - writer.writerow([sample_id, original_key, error_message]) + writer.writerow([sample_id, keys, error_message]) - # Update the JSON file with the new sample ID - json_data[sample_id] = json_data.pop(original_key) - with open(json_file, "wt") as f: - json.dump(json_data, f, indent=4) + # Write the updated JSON data back to the original file + with open_file(json_file, "wt") as f: + json.dump(json_data, f, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser( description="Check sample inputs, force change if ID ≠ KEY, and generate an error report." diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index c01c319..e0376ac 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -1,5 +1,5 @@ process INPUT_ASSURE { - tag "Check Sample Inputs and Generate Error Report" + tag "Assures Inputs are Consistent" label 'process_single' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -10,14 +10,14 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path(mlst), emit: match + tuple val(meta), path(mlst), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions script: """ - input_check.py \\ + input_assure.py \\ --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index d527777..4531ff1 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -72,15 +72,15 @@ workflow GAS_NOMENCLATURE { input = Channel.fromSamplesheet("input") // Ensure meta.id and mlst_file keys match; generate error report for samples where id ≠ key - id_key = INPUT_ASSURE(input) - ch_versions = ch_versions.mix(id_key.versions) + input_assure = INPUT_ASSURE(input) + ch_versions = ch_versions.mix(input_assure.versions) // Prepare reference and query TSV files for LOCIDEX_MERGE - profiles = id_key.match.branch { + profiles = input_assure.result.branch { query: !it[0].address } - reference_values = input.collect{ meta, profile -> profile} - query_values = profiles.query.collect{ meta, profile -> profile } + reference_values = input_assure.result.collect{ meta, mlst -> mlst} + query_values = profiles.query.collect{ meta, mlst -> mlst } // LOCIDEX modules ref_tag = Channel.value("ref") From f7ed9d3325e78da2a394d9f5d96b70e10f904a33 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:59:14 -0400 Subject: [PATCH 084/119] Fixed linting issues --- bin/input_assure.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 2705eae..779e888 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -5,46 +5,48 @@ import csv import gzip + def open_file(file_path, mode): # Open a file based on the file extension - if file_path.endswith('.gz'): + if file_path.endswith(".gz"): return gzip.open(file_path, mode) else: return open(file_path, mode) + def check_inputs(json_file, sample_id, address, output_error_file): with open_file(json_file, "rt") as f: json_data = json.load(f) # Define a variable to store the match_status (True or False) match_status = sample_id in json_data - - keys = list (json_data.keys()) + + keys = list(json_data.keys()) original_key = keys[0] # Initialize the error message - error_message = None - + error_message = None + # Check for multiple keys in the JSON file and define error message if len(keys) > 1: # Check if sample_id matches any key if not match_status: error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." - # Retain only the specified sample ID + # Retain only the specified sample ID json_data = {sample_id: json_data.pop(original_key)} else: error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. 
The MLST JSON file has been modified to retain only the '{sample_id}' entry" # Remove all keys except the one matching sample_id - json_data = {sample_id: json_data[sample_id]} + json_data = {sample_id: json_data[sample_id]} elif not match_status: - # Define error message based on meta.address (query or reference) + # Define error message based on meta.address (query or reference) if address == "null": error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." else: error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." # Update the JSON file with the new sample ID json_data[sample_id] = json_data.pop(original_key) - + # Write file containing relevant error messages if error_message: with open(output_error_file, "w", newline="") as f: @@ -52,10 +54,11 @@ def check_inputs(json_file, sample_id, address, output_error_file): writer.writerow(["sample", "JSON_key", "error_message"]) writer.writerow([sample_id, keys, error_message]) - # Write the updated JSON data back to the original file + # Write the updated JSON data back to the original file with open_file(json_file, "wt") as f: json.dump(json_data, f, indent=4) + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Check sample inputs, force change if ID ≠ KEY, and generate an error report." From 6d76d501cc0f8a5031c5ce134f9e82b6f040cb81 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 11 Jun 2024 11:55:15 -0400 Subject: [PATCH 085/119] Update filter_query to accommodate multiple queries --- modules/local/filter_query/main.nf | 12 +++++++----- workflows/gas_nomenclature.nf | 4 +++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/modules/local/filter_query/main.nf b/modules/local/filter_query/main.nf index 5bb4b17..9912ee5 100644 --- a/modules/local/filter_query/main.nf +++ b/modules/local/filter_query/main.nf @@ -7,7 +7,7 @@ process FILTER_QUERY { 'biocontainers/csvtk:0.22.0--h9ee0642_1' }" input: - val input_query + val query_ids path addresses val in_format val out_format @@ -17,19 +17,19 @@ process FILTER_QUERY { path("versions.yml"), emit: versions script: - - def queryID = input_query[0].id def outputFile = "new_addresses" - def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) def out_extension = out_format == "tsv" ? 
'tsv' : 'csv' + // Join the query IDs into the single filter expression csvtk filter2 requires, e.g. '\$id == "sampleQ" || \$id == "sampleN"' + def queryID = query_ids.collect { id -> "\$id == \"${id}\"" }.join(" || ") + """ # Filter the query samples only; keep only the 'id' and 'address' columns csvtk filter2 \\ ${addresses} \\ - --filter '\$id == \"$queryID\"' \\ + --filter '$queryID' \\ --delimiter "${delimiter}" \\ --out-delimiter "${out_delimiter}" | \\ csvtk cut -f id,address > ${outputFile}.${out_extension} @@ -39,5 +39,7 @@ process FILTER_QUERY { csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) END_VERSIONS """ + + } diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 813de21..9a8913f 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -142,7 +142,9 @@ workflow GAS_NOMENCLATURE { ch_versions = ch_versions.mix(called_data.versions) // Filter the new queried samples and addresses into a CSV/JSON file for the IRIDANext plug-in - new_addresses = FILTER_QUERY(profiles.query, called_data.distances, "tsv", "csv") + query_ids = profiles.query.collect { it[0].id } + + new_addresses = FILTER_QUERY(query_ids, called_data.distances, "tsv", "csv") ch_versions = ch_versions.mix(new_addresses.versions) CUSTOM_DUMPSOFTWAREVERSIONS ( From 59f4d176dedfdad7e144b48e30cc5a4249159a5a Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 11 Jun 2024 11:57:59 -0400 Subject: [PATCH 086/119] Add test for multiple_queries and supporting files --- .../data/called/expected_results_queries.txt | 6 ++ .../data/irida/queries_iridanext.output.json | 16 +++++ .../profiles/expected-profile_queries1.tsv | 6 ++ .../profiles/expected-profile_queries2.tsv | 3 + tests/data/reports/sampleN.mlst.json | 7 +++ .../samplesheet-multiple_queries.csv | 6 ++ tests/pipelines/main.nf.test | 60 +++++++++++++++++++ 7 files changed, 104 insertions(+) create mode 100644 tests/data/called/expected_results_queries.txt create mode 100644 tests/data/irida/queries_iridanext.output.json create mode 100644 tests/data/profiles/expected-profile_queries1.tsv create mode 100644 tests/data/profiles/expected-profile_queries2.tsv create mode 100644 tests/data/reports/sampleN.mlst.json create mode 100644 tests/data/samplesheets/samplesheet-multiple_queries.csv diff --git a/tests/data/called/expected_results_queries.txt b/tests/data/called/expected_results_queries.txt new file mode 100644 index 0000000..f5e5ae4 --- /dev/null +++ b/tests/data/called/expected_results_queries.txt @@ -0,0 +1,6 @@ +id address level_1 level_2 level_3 +sample1 1.1.1 1 1 1 +sample2 1.1.1 1 1 1 +sample3 1.1.2 1 1 2 +sampleQ 2.2.3 2 2 3 +sampleN 2.2.3 2 2 3 diff --git a/tests/data/irida/queries_iridanext.output.json b/tests/data/irida/queries_iridanext.output.json new file mode 100644 index 0000000..7063e8e --- /dev/null +++ b/tests/data/irida/queries_iridanext.output.json @@ -0,0 +1,16 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "2.2.3" + }, + "sampleN": { + "address": "2.2.3" + } + } + } +} diff --git a/tests/data/profiles/expected-profile_queries1.tsv b/tests/data/profiles/expected-profile_queries1.tsv new file mode 100644 index 0000000..b2f8100 --- /dev/null +++ b/tests/data/profiles/expected-profile_queries1.tsv @@ -0,0 +1,6 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sampleN 1 2 1 +sample1 1 1 1 +sample2 1 1 1 +sample3 1 1 2 diff --git a/tests/data/profiles/expected-profile_queries2.tsv b/tests/data/profiles/expected-profile_queries2.tsv new file mode 100644 index 0000000..4b4d059 --- /dev/null
+++ b/tests/data/profiles/expected-profile_queries2.tsv @@ -0,0 +1,3 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sampleN 1 2 1 diff --git a/tests/data/reports/sampleN.mlst.json b/tests/data/reports/sampleN.mlst.json new file mode 100644 index 0000000..178b6db --- /dev/null +++ b/tests/data/reports/sampleN.mlst.json @@ -0,0 +1,7 @@ +{ + "sampleN": { + "l1": "1", + "l2": "2", + "l3": "1" + } +} diff --git a/tests/data/samplesheets/samplesheet-multiple_queries.csv b/tests/data/samplesheets/samplesheet-multiple_queries.csv new file mode 100644 index 0000000..eb661ca --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiple_queries.csv @@ -0,0 +1,6 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sampleN,/root/working_directory/nml-phac/gasnomenclature/tests/data/reports/sampleN.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index e35a9c3..e3a467a 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -32,6 +32,11 @@ nextflow_pipeline { def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.txt") assert actual_distances.text == expected_distances.text + // Verify cluster file + def actual_cluster = path("$launchDir/results/cluster/expected_clusters.txt") + def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt") + assert actual_cluster.text == expected_cluster.text + // Check called clusters def actual_calls = path("$launchDir/results/call/Called/results.text") def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") @@ -49,6 +54,61 @@ nextflow_pipeline { } } + test("Small-scale test of full pipeline with multiple queries"){ + tag "pipeline_success_multiple_queries" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiple_queries.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check merged profiles + def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile_queries1.tsv") + assert actual_profile_ref.text == expected_profile_tsv.text + + // Check query profiles + def actual_profile_query = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv") + def expected_profile_query_tsv = path("$baseDir/tests/data/profiles/expected-profile_queries2.tsv") + assert actual_profile_query.text == expected_profile_query_tsv.text + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_queries_dists.txt") + assert actual_distances.text == expected_distances.text + + // Verify cluster file + def actual_cluster = path("$launchDir/results/cluster/expected_clusters.txt") + def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt") + assert actual_cluster.text == expected_cluster.text + + // Check called clusters + def 
actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_queries.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/queries_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 2 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.containsKey("sampleN") + + assert iridanext_metadata.sampleQ."address" == "2.2.3" + assert iridanext_metadata.sampleN.address == "2.2.3" + } + } + test("Integration test where input contains reference sample with mismatched MLST JSON file"){ tag "pipeline_failure" From 0742c50e81c6296209215ea8044061d65101f802 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 11 Jun 2024 11:58:43 -0400 Subject: [PATCH 087/119] Implement 'fair true' in input_check to ensure consistent ordering of samples --- modules/local/input_check/main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/local/input_check/main.nf b/modules/local/input_check/main.nf index 79a2242..762aeae 100644 --- a/modules/local/input_check/main.nf +++ b/modules/local/input_check/main.nf @@ -1,6 +1,7 @@ process INPUT_CHECK{ tag "Check Sample Inputs and Generate Error Report" label 'process_single' + fair true container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : From d1c9809e48e9e61431ce379e4f64c951737d8bc4 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 11 Jun 2024 13:09:56 -0400 Subject: [PATCH 088/119] Update file path in samplesheet --- tests/data/samplesheets/samplesheet-multiple_queries.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/samplesheets/samplesheet-multiple_queries.csv b/tests/data/samplesheets/samplesheet-multiple_queries.csv index eb661ca..e429a1c 100644 --- a/tests/data/samplesheets/samplesheet-multiple_queries.csv +++ b/tests/data/samplesheets/samplesheet-multiple_queries.csv @@ -1,6 +1,6 @@ sample,mlst_alleles,address sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, -sampleN,/root/working_directory/nml-phac/gasnomenclature/tests/data/reports/sampleN.mlst.json, +sampleN,https://raw.githubusercontent.com/phac-nml/gasnomenclature/update-filter_query/tests/data/reports/sampleN.mlst.json, sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 From 07b6c3c87f8f5465a98b8a35aa05a7e534e5a8b2 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 11 Jun 2024 14:02:40 -0400 Subject: [PATCH 089/119] Added missing expected test file --- .../distances/expected_pairwise_queries_dists.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/data/distances/expected_pairwise_queries_dists.txt diff --git a/tests/data/distances/expected_pairwise_queries_dists.txt b/tests/data/distances/expected_pairwise_queries_dists.txt new file mode
100644 index 0000000..44aa848 --- /dev/null +++ b/tests/data/distances/expected_pairwise_queries_dists.txt @@ -0,0 +1,11 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sampleN 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 +sampleN sampleQ 0 +sampleN sampleN 0 +sampleN sample1 1 +sampleN sample2 1 +sampleN sample3 2 From 15b7090691f90dbb969be9db01218715673201ca Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 14:24:10 -0400 Subject: [PATCH 090/119] Changed cluster_file output filename for clarity --- modules/local/cluster_file/main.nf | 4 ++-- tests/modules/cluster_file/main.nf.test | 4 ++-- tests/pipelines/main.nf.test | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/local/cluster_file/main.nf b/modules/local/cluster_file/main.nf index dfb8004..0a97545 100644 --- a/modules/local/cluster_file/main.nf +++ b/modules/local/cluster_file/main.nf @@ -6,7 +6,7 @@ process CLUSTER_FILE { val meta output: - path("expected_clusters.txt"), emit: text + path("reference_clusters.txt"), emit: text exec: def outputLines = [] @@ -37,7 +37,7 @@ process CLUSTER_FILE { } // Write the text file, iterating over each sample - task.workDir.resolve("expected_clusters.txt").withWriter { writer -> + task.workDir.resolve("reference_clusters.txt").withWriter { writer -> outputLines.each { line -> writer.writeLine(line) } diff --git a/tests/modules/cluster_file/main.nf.test b/tests/modules/cluster_file/main.nf.test index 43fd71c..3f13833 100644 --- a/tests/modules/cluster_file/main.nf.test +++ b/tests/modules/cluster_file/main.nf.test @@ -25,8 +25,8 @@ nextflow_process { assert process.success assert path("$launchDir/cluster_results").exists() - // Check expected_clusters - def actual_clusters = path("$launchDir/cluster_results/cluster/expected_clusters.txt") + // Check reference_clusters file + def actual_clusters = path("$launchDir/cluster_results/cluster/reference_clusters.txt") def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters.txt") assert actual_clusters.text == expected_clusters.text } diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index e3a467a..53ad3d1 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -33,7 +33,7 @@ nextflow_pipeline { assert actual_distances.text == expected_distances.text // Verify cluster file - def actual_cluster = path("$launchDir/results/cluster/expected_clusters.txt") + def actual_cluster = path("$launchDir/results/cluster/reference_clusters.txt") def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt") assert actual_cluster.text == expected_cluster.text @@ -84,7 +84,7 @@ nextflow_pipeline { assert actual_distances.text == expected_distances.text // Verify cluster file - def actual_cluster = path("$launchDir/results/cluster/expected_clusters.txt") + def actual_cluster = path("$launchDir/results/cluster/reference_clusters.txt") def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt") assert actual_cluster.text == expected_cluster.text From 7592bd3bea310522d45de6725804a6f3093050c5 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 15:13:49 -0400 Subject: [PATCH 091/119] Resolve conflicts between dev and input_assure --- modules/local/input_assure/main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index 30dfdaa..e0376ac 100644 --- a/modules/local/input_assure/main.nf +++ 
b/modules/local/input_assure/main.nf @@ -1,7 +1,6 @@ process INPUT_ASSURE { tag "Assures Inputs are Consistent" label 'process_single' - fair true container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : From 32663302aa75ba71c3b606c08b2ec62e3dea3c03 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 16:08:44 -0400 Subject: [PATCH 092/119] Add test with gzipped MLST JSON file --- tests/data/reports/sample1.mlst.json.gz | Bin 0 -> 84 bytes tests/data/samplesheets/samplesheet_gzip.csv | 5 +++ tests/pipelines/main.nf.test | 35 +++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 tests/data/reports/sample1.mlst.json.gz create mode 100644 tests/data/samplesheets/samplesheet_gzip.csv diff --git a/tests/data/reports/sample1.mlst.json.gz b/tests/data/reports/sample1.mlst.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..94f25c81407bfee01777e59fb4af80f5998dbf30 GIT binary patch literal 84 [base85-encoded binary data truncated in the source; the samplesheet_gzip.csv and main.nf.test hunks of this patch are also missing] From: kylacochrane Date: Wed, 12 Jun 2024 16:56:07 -0400 Subject: [PATCH 093/119] Added test for mismatched IDs --- .../irida/mismatched_iridanext.output.json | 29 ++++++++ .../samplesheet-mismatched_IDs.csv | 7 ++ tests/pipelines/main.nf.test | 66 +++++++------------ 3 files changed, 61 insertions(+), 41 deletions(-) create mode 100644 tests/data/irida/mismatched_iridanext.output.json create mode 100644 tests/data/samplesheets/samplesheet-mismatched_IDs.csv diff --git a/tests/data/irida/mismatched_iridanext.output.json b/tests/data/irida/mismatched_iridanext.output.json new file mode 100644 index 0000000..ec418dc --- /dev/null +++ b/tests/data/irida/mismatched_iridanext.output.json @@ -0,0 +1,29 @@ +{ + "files": { + "global": [ + + ], + "samples": { + "sampleR": [ + { + "path": "input/sampleR_error_report.csv" + } + ], + "sample2": [ + { + "path": "input/sample2_error_report.csv" + } + ] + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "2.2.3" + }, + "sampleR": { + "address": "2.2.3" + } + } + } +} \ No newline at end of file diff --git a/tests/data/samplesheets/samplesheet-mismatched_IDs.csv b/tests/data/samplesheets/samplesheet-mismatched_IDs.csv new file mode 100644 index 0000000..73230d4 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-mismatched_IDs.csv @@ -0,0 +1,7 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sampleR,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample7.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 + diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index d47b7ed..9892b40 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -105,12 +105,12 @@ nextflow_pipeline { assert iridanext_metadata.containsKey("sampleN") assert iridanext_metadata.sampleQ."address" == "2.2.3" - assert iridanext_metadata.sampleN.address == "2.2.3" + assert iridanext_metadata.sampleN."address" == "2.2.3" } } test("Small-scale test of full pipeline with gzipped MLST JSON"){ - tag
"pipeline_success_gzipped_JSON" + tag "Gzipped_MLST_JSON" when{ params { @@ -144,37 +144,15 @@ nextflow_pipeline { } } - test("Integration test where input contains reference sample with mismatched MLST JSON file"){ - tag "pipeline_failure" - - when { - params { - input = "$baseDir/tests/data/samplesheets/samplesheet_test1.csv" - outdir = "results" - } - } - - then { - assert workflow.failed - assert (workflow.stdout =~ /Pipeline exiting: sample with ID sample2 does not have matching MLST JSON file./).find() - - assert path("$launchDir/results").exists() - assert path("$launchDir/results/input").exists() - - // Ensure that despite pipeline failure, error_reports are generated for all samples added to pipeline (i.e. sampleQ query) - def lines = [] - - lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() - assert lines.contains("sample2,sample7,Pipeline stopped: Reference sample2's input ID and MLST JSON file key DO NOT MATCH") - } - } - - test("Integration test where input contains a single query sample with mismatched MLST JSON file"){ - tag "pipeline_success_after_query_removal" + test("Testing when query and reference sample IDs are mismatched with MLST JSON file keys"){ + // IDs in the sample sheet and IDs in the individual MLST JSON files will not match. + // This tests the pipelines ability to handle and correct for this problem. + + tag "mismatched_IDs" when{ params { - input = "$baseDir/tests/data/samplesheets/samplesheet_test2.csv" + input = "$baseDir/tests/data/samplesheets/samplesheet-mismatched_IDs.csv" outdir = "results" } } @@ -182,29 +160,35 @@ nextflow_pipeline { then { assert workflow.success assert path("$launchDir/results").exists() - assert path("$launchDir/results/input").exists() - assert path("$launchDir/results/filter").exists() - + // Check outputs def lines = [] - // Ensure that the error_report is generated for removed query sampleR + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() + assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") + lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() - assert lines.contains("sampleR,sampleF,Query sampleR removed from pipeline") + assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. 
The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") - // Check query output csv + // Check filter_query csv file lines = path("$launchDir/results/filter/new_addresses.csv").readLines() - assert lines.contains("sampleQ,1.1.3") + assert lines.contains("sampleQ,2.2.3") + assert lines.contains("sampleR,2.2.3") - // Check IRIDA Next JSON output - assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test2_iridanext.output.json").json + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/mismatched_iridanext.output.json").json def iridanext_json = path("$launchDir/results/iridanext.output.json").json def iridanext_samples = iridanext_json.files.samples def iridanext_metadata = iridanext_json.metadata.samples - assert iridanext_samples.sampleR.findAll { it.path == "input/sampleR_error_report.csv" }.size() == 1 - assert iridanext_metadata.sampleQ."address" == "1.1.3" + assert iridanext_metadata.size() == 2 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.containsKey("sampleR") + + assert iridanext_metadata.sampleQ."address" == "2.2.3" + assert iridanext_metadata.sampleR."address" == "2.2.3" } } From 001709087fdbb07d23aa25193d2379dea17dcbe3 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 16:59:27 -0400 Subject: [PATCH 094/119] Update paths in samplesheet --- tests/data/samplesheets/samplesheet_gzip.csv | 2 +- tests/pipelines/main.nf.test | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/samplesheets/samplesheet_gzip.csv b/tests/data/samplesheets/samplesheet_gzip.csv index a4b5bad..2337c78 100644 --- a/tests/data/samplesheets/samplesheet_gzip.csv +++ b/tests/data/samplesheets/samplesheet_gzip.csv @@ -1,5 +1,5 @@ sample,mlst_alleles,address sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, -sample1,/root/working_directory/gas/gasnomenclature/tests/data/reports/sample1.mlst.json.gz,1.1.1 +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample1.mlst.json.gz,1.1.1 sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 9892b40..77c2672 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -166,7 +166,7 @@ nextflow_pipeline { // Ensure that the error_reports are generated for query and reference samples lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() - assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") + #assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. 
The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") From 3f181eb99c1e458977b03abdbdc83dad2ba2610e Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 17:00:48 -0400 Subject: [PATCH 095/119] Fix EC issues --- tests/pipelines/main.nf.test | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 77c2672..43449ea 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -126,7 +126,7 @@ nextflow_pipeline { // Check is sample1.mlst.json.gz exists and is gzipped def gzipped_json = path("$launchDir/results/input/sample1.mlst.json.gz") assert gzipped_json.exists() - + // Check called clusters def actual_calls = path("$launchDir/results/call/Called/results.text") def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") @@ -147,7 +147,7 @@ nextflow_pipeline { test("Testing when query and reference sample IDs are mismatched with MLST JSON file keys"){ // IDs in the sample sheet and IDs in the individual MLST JSON files will not match. // This tests the pipelines ability to handle and correct for this problem. - + tag "mismatched_IDs" when{ @@ -160,14 +160,14 @@ nextflow_pipeline { then { assert workflow.success assert path("$launchDir/results").exists() - + // Check outputs def lines = [] // Ensure that the error_reports are generated for query and reference samples lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() #assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") - + lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. 
The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") @@ -176,7 +176,7 @@ nextflow_pipeline { assert lines.contains("sampleQ,2.2.3") assert lines.contains("sampleR,2.2.3") - // Check IRIDA Next JSON output + // Check IRIDA Next JSON output assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/mismatched_iridanext.output.json").json def iridanext_json = path("$launchDir/results/iridanext.output.json").json From 1f525294704b6b8a7c9110e2bdd35085578e8ddf Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 17:02:29 -0400 Subject: [PATCH 096/119] Fix EC issues --- tests/data/irida/mismatched_iridanext.output.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/data/irida/mismatched_iridanext.output.json b/tests/data/irida/mismatched_iridanext.output.json index ec418dc..750523b 100644 --- a/tests/data/irida/mismatched_iridanext.output.json +++ b/tests/data/irida/mismatched_iridanext.output.json @@ -1,8 +1,6 @@ { "files": { - "global": [ - - ], + "global": [], "samples": { "sampleR": [ { @@ -26,4 +24,4 @@ } } } -} \ No newline at end of file +} From ec347e46b808810f5ebfd6fc532aed832f4e91d3 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 17:08:30 -0400 Subject: [PATCH 097/119] Removed unexpected character (#) in main.nf.test --- tests/pipelines/main.nf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 43449ea..f1df721 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -166,7 +166,7 @@ nextflow_pipeline { // Ensure that the error_reports are generated for query and reference samples lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() - #assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") + assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. 
The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") From 7c1b5dc31536243a5870ed050f5b90286b33ff67 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 10:13:08 -0400 Subject: [PATCH 098/119] Add test data for multiple keyed JSON file --- tests/data/reports/sample3_multiplekeys.mlst.json | 12 ++++++++++++ .../reports/sample3_multiplekeys_nomatch.mlst.json | 12 ++++++++++++ .../data/samplesheets/samplesheet-multiple_keys.csv | 5 +++++ .../samplesheet-multiplekeys_nomatch.csv | 5 +++++ 4 files changed, 34 insertions(+) create mode 100644 tests/data/reports/sample3_multiplekeys.mlst.json create mode 100644 tests/data/reports/sample3_multiplekeys_nomatch.mlst.json create mode 100644 tests/data/samplesheets/samplesheet-multiple_keys.csv create mode 100644 tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv diff --git a/tests/data/reports/sample3_multiplekeys.mlst.json b/tests/data/reports/sample3_multiplekeys.mlst.json new file mode 100644 index 0000000..5d85e65 --- /dev/null +++ b/tests/data/reports/sample3_multiplekeys.mlst.json @@ -0,0 +1,12 @@ +{ + "extra_key": { + "l1": "1", + "l2": "1", + "l3": "2" + }, + "sample3": { + "l1": "1", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json b/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json new file mode 100644 index 0000000..6d7878d --- /dev/null +++ b/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json @@ -0,0 +1,12 @@ +{ + "sample4": { + "l1": "1", + "l2": "1", + "l3": "2" + }, + "extra_key": { + "l1": "1", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/samplesheets/samplesheet-multiple_keys.csv b/tests/data/samplesheets/samplesheet-multiple_keys.csv new file mode 100644 index 0000000..867d7d6 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiple_keys.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample3_multiplekeys.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv b/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv new file mode 100644 index 0000000..cdd0bf0 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json,1.1.2 From 8e8ffa446b816d973e5aa04888cc26d75b9c18b4 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 11:21:17 -0400 Subject: [PATCH 099/119] Tests added to handle when there are multiple sample entries (keys) in provided MLST JSON file(s) --- .../irida/multiplekeys_iridanext.output.json | 19 ++++ 
tests/pipelines/main.nf.test | 97 +++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 tests/data/irida/multiplekeys_iridanext.output.json diff --git a/tests/data/irida/multiplekeys_iridanext.output.json b/tests/data/irida/multiplekeys_iridanext.output.json new file mode 100644 index 0000000..f7b872f --- /dev/null +++ b/tests/data/irida/multiplekeys_iridanext.output.json @@ -0,0 +1,19 @@ +{ + "files": { + "global": [], + "samples": { + "sample3": [ + { + "path": "input/sample3_error_report.csv" + } + ] + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.1.3" + } + } + } +} diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index f1df721..4cec606 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -192,4 +192,101 @@ nextflow_pipeline { } } + test("Testing data removal in MLST JSON with a matching sampleID key."){ + // There are multiple sample entries (keys) in the MLST JSON and one of them matches the sampleID. + // This test evaluates the pipeline's ability to address this issue by removing keys that do not match the sampleID. + + tag "multiple_keys_with_matching_ID" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiple_keys.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check outputs + def lines = [] + + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample3_error_report.csv").readLines() + assert lines.contains('sample3,"[\'extra_key\', \'sample3\']","MLST JSON file (sample3_multiplekeys.mlst.json) contains multiple keys: [\'extra_key\', \'sample3\']. The MLST JSON file has been modified to retain only the \'sample3\' entry"') + + // Check filtered query csv results + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,1.1.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/multiplekeys_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_samples.sample3.size() == 1 + assert iridanext_samples.sample3[0].path == 'input/sample3_error_report.csv' + + assert iridanext_metadata.size() == 1 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } + + test("Testing the removal of data in MLST JSON with no sampleID match."){ + // There are multiple sample entries (keys) in the MLST JSON and none of them match the sampleID. + // This test ensures the pipeline can handle and resolve this issue by retaining only the first JSON key entry and renaming it to match the sampleID.
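+ // Illustrative sketch of the assumed behavior (not asserted verbatim by this test): given
+ // sample3_multiplekeys_nomatch.mlst.json with keys ['sample4', 'extra_key'], input_assure.py
+ // should rewrite the file to {"sample3": {"l1": "1", "l2": "1", "l3": "2"}}, renaming the
+ // first key and dropping the rest.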
+ + tag "multiple_keys_without_matching_ID" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check outputs + def lines = [] + + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample3_error_report.csv").readLines() + assert lines.contains("sample3,\"[\'sample4\', \'extra_key\']\",No key in the MLST JSON file (sample3_multiplekeys_nomatch.mlst.json) matches the specified sample ID \'sample3\'. The first key \'sample4\' has been forcefully changed to \'sample3\' and all other keys have been removed.") + + // Check filtered query csv results + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,1.1.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/multiplekeys_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_samples.sample3.size() == 1 + assert iridanext_samples.sample3[0].path == 'input/sample3_error_report.csv' + + assert iridanext_metadata.size() == 1 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } } From 79096738d4e792ceddf399c53d356d10f463bb55 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 16:10:21 -0400 Subject: [PATCH 100/119] Updated input_assure to identify when MLST JSON is empty. Added corresponding test --- bin/input_assure.py | 12 ++++++++---- tests/data/reports/sample2_empty.mlst.json | 1 + .../data/samplesheets/samplesheet_emptyJSON.csv | 6 ++++++ tests/pipelines/main.nf.test | 16 ++++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 tests/data/reports/sample2_empty.mlst.json create mode 100644 tests/data/samplesheets/samplesheet_emptyJSON.csv diff --git a/bin/input_assure.py b/bin/input_assure.py index 779e888..5e749b5 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -4,6 +4,7 @@ import argparse import csv import gzip +import sys def open_file(file_path, mode): @@ -13,7 +14,6 @@ def open_file(file_path, mode): else: return open(file_path, mode) - def check_inputs(json_file, sample_id, address, output_error_file): with open_file(json_file, "rt") as f: json_data = json.load(f) @@ -22,13 +22,17 @@ def check_inputs(json_file, sample_id, address, output_error_file): match_status = sample_id in json_data keys = list(json_data.keys()) - original_key = keys[0] + original_key = keys[0] if keys else None # Initialize the error message error_message = None # Check for multiple keys in the JSON file and define error message - if len(keys) > 1: + if len(keys) == 0: + error_message = f"{json_file} is completely empty!" + print(error_message) + sys.exit(1) + elif len(keys) > 1: # Check if sample_id matches any key if not match_status: error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. 
The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." @@ -76,4 +80,4 @@ def check_inputs(json_file, sample_id, address, output_error_file): args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) + check_inputs(args.input, args.sample_id, args.address, args.output_error) \ No newline at end of file diff --git a/tests/data/reports/sample2_empty.mlst.json b/tests/data/reports/sample2_empty.mlst.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/tests/data/reports/sample2_empty.mlst.json @@ -0,0 +1 @@ +{} diff --git a/tests/data/samplesheets/samplesheet_emptyJSON.csv b/tests/data/samplesheets/samplesheet_emptyJSON.csv new file mode 100644 index 0000000..efcb1bb --- /dev/null +++ b/tests/data/samplesheets/samplesheet_emptyJSON.csv @@ -0,0 +1,6 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample2_empty.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 + diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 4cec606..d292d1d 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -289,4 +289,20 @@ nextflow_pipeline { assert iridanext_metadata.sampleQ."address" == "1.1.3" } } + + test("Testing when provided MLST JSON file(s) are empty."){ + tag "empty_JSON" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_emptyJSON.csv" + outdir = "results" + } + } + + then { + assert workflow.failed + assert (workflow.stdout =~ /sample2_empty.mlst.json is completely empty!/).find() + } + } } From da8c82992277aa2903727b5730b95544f92b1097 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 16:12:28 -0400 Subject: [PATCH 101/119] EC issue fix --- bin/input_assure.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5e749b5..5fabad4 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -14,6 +14,7 @@ def open_file(file_path, mode): else: return open(file_path, mode) + def check_inputs(json_file, sample_id, address, output_error_file): with open_file(json_file, "rt") as f: json_data = json.load(f) @@ -80,4 +81,4 @@ def check_inputs(json_file, sample_id, address, output_error_file): args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) \ No newline at end of file + check_inputs(args.input, args.sample_id, args.address, args.output_error) From 6642b72ad0132805bf23083cf9979a8dde965941 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 17:07:04 -0400 Subject: [PATCH 102/119] Create a new JSON output file in input_assure --- bin/input_assure.py | 17 +++++++++++------ modules/local/input_assure/main.nf | 5 +++-- tests/pipelines/main.nf.test | 5 ++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5fabad4..7926cab 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -15,20 +15,20 @@ def open_file(file_path, mode): return open(file_path, mode) -def check_inputs(json_file, sample_id, address, output_error_file): +def 
check_inputs(json_file, sample_id, address, output_error_file, output_json_file): with open_file(json_file, "rt") as f: json_data = json.load(f) # Define a variable to store the match_status (True or False) match_status = sample_id in json_data - keys = list(json_data.keys()) - original_key = keys[0] if keys else None - # Initialize the error message error_message = None # Check for multiple keys in the JSON file and define error message + keys = list(json_data.keys()) + original_key = keys[0] if keys else None + if len(keys) == 0: error_message = f"{json_file} is completely empty!" print(error_message) @@ -60,7 +60,7 @@ def check_inputs(json_file, sample_id, address, output_error_file): writer.writerow([sample_id, keys, error_message]) # Write the updated JSON data to the output file - with open_file(json_file, "wt") as f: + with open_file(output_json_file, "wt") as f: json.dump(json_data, f, indent=4) @@ -78,7 +78,12 @@ def check_inputs(json_file, sample_id, address, output_error_file): parser.add_argument( "--output_error", help="Path to the error report file.", required=True ) + parser.add_argument( + "--output_json", help="Path to the MLST JSON file.", required=True + ) args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) + check_inputs( + args.input, args.sample_id, args.address, args.output_error, args.output_json + ) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index e0376ac..dd72bb1 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path(mlst), emit: result + tuple val(meta), path("${meta.id}.mlst.json"), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -21,7 +21,8 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv + --output_error ${meta.id}_error_report.csv \\ + --output_json ${meta.id}.mlst.json cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index d292d1d..b6a5ab8 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -123,9 +123,8 @@ nextflow_pipeline { assert workflow.success assert path("$launchDir/results").exists() - // Check if sample1.mlst.json.gz exists and is gzipped - def gzipped_json = path("$launchDir/results/input/sample1.mlst.json.gz") - assert gzipped_json.exists() + // Check that sample1.mlst.json.gz has been opened and read, and that a new file has been generated + assert path("$launchDir/results/input/sample1.mlst.json").exists() // Check called clusters def actual_calls = path("$launchDir/results/call/Called/results.text") def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") From 348fe9558c27e1ef76ba387c4abc31632656c1aa Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 14 Jun 2024 12:01:48 -0400 Subject: [PATCH 103/119] Ensure MLST JSON files from input_assure are gzipped --- bin/input_assure.py | 4 ++-- modules/local/input_assure/main.nf | 4 ++-- tests/data/irida/test2_iridanext.output.json | 19 ------------------ tests/data/reports/sample1.mlst.json.gz | Bin 84 -> 84 bytes tests/data/samplesheets/samplesheet_test1.csv | 5 ----- tests/data/samplesheets/samplesheet_test2.csv | 7 ------- tests/pipelines/main.nf.test | 4 ++-- 7 files changed, 6 insertions(+), 37 deletions(-) delete mode 100644
tests/data/irida/test2_iridanext.output.json delete mode 100644 tests/data/samplesheets/samplesheet_test1.csv delete mode 100644 tests/data/samplesheets/samplesheet_test2.csv diff --git a/bin/input_assure.py b/bin/input_assure.py index 7926cab..d99bf2a 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -60,7 +60,7 @@ def check_inputs(json_file, sample_id, address, output_error_file, output_json_f writer.writerow([sample_id, keys, error_message]) # Write the updated JSON data to the output file - with open_file(output_json_file, "wt") as f: + with gzip.open(output_json_file, "wt") as f: json.dump(json_data, f, indent=4) @@ -79,7 +79,7 @@ def check_inputs(json_file, sample_id, address, output_error_file, output_json_f "--output_error", help="Path to the error report file.", required=True ) parser.add_argument( - "--output_json", help="Path to the MLST JSON file.", required=True + "--output_json", help="Path to the MLST JSON file (gzipped).", required=True ) args = parser.parse_args() diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index dd72bb1..43b7462 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path("${meta.id}.mlst.json"), emit: result + tuple val(meta), path("${meta.id}.mlst.json.gz"), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -22,7 +22,7 @@ process INPUT_ASSURE { --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_json ${meta.id}.mlst.json + --output_json ${meta.id}.mlst.json.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/tests/data/irida/test2_iridanext.output.json b/tests/data/irida/test2_iridanext.output.json deleted file mode 100644 index 2882954..0000000 --- a/tests/data/irida/test2_iridanext.output.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "files": { - "global": [], - "samples": { - "sampleR": [ - { - "path": "input/sampleR_error_report.csv" - } - ] - } - }, - "metadata": { - "samples": { - "sampleQ": { - "address": "1.1.3" - } - } - } -} diff --git a/tests/data/reports/sample1.mlst.json.gz b/tests/data/reports/sample1.mlst.json.gz index 94f25c81407bfee01777e59fb4af80f5998dbf30..735e1082b5193673c4844e4f4558af8e8206a12f 100644 GIT binary patch delta 13 UcmWFuVVCdb;5f#WHIdyH02iwRi2wiq delta 13 UcmWFuVVCdb;5hm}b0WJh032!rd;kCd diff --git a/tests/data/samplesheets/samplesheet_test1.csv b/tests/data/samplesheets/samplesheet_test1.csv deleted file mode 100644 index cf87b26..0000000 --- a/tests/data/samplesheets/samplesheet_test1.csv +++ /dev/null @@ -1,5 +0,0 @@ -sample,mlst_alleles,address -sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, -sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 -sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample7.mlst.json,1.1.1 -sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet_test2.csv b/tests/data/samplesheets/samplesheet_test2.csv deleted file mode 100644 index 036c317..0000000 --- a/tests/data/samplesheets/samplesheet_test2.csv +++ /dev/null @@ -1,7 +0,0 @@ -sample,mlst_alleles,address
-sampleR,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, -sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, -sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 -sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 -sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 - diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index b6a5ab8..6716dae 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -123,8 +123,8 @@ nextflow_pipeline { assert workflow.success assert path("$launchDir/results").exists() - // Check that sample1.mlst.json.gz has been opened and read, and that a new file has been generated - assert path("$launchDir/results/input/sample1.mlst.json").exists() + // Check that sample1.mlst.json.gz has been opened and read, and that a new gzipped file has been generated + assert path("$launchDir/results/input/sample1.mlst.json.gz").exists() // Check called clusters def actual_calls = path("$launchDir/results/call/Called/results.text") From 327e46b84ec71384111c83f568224c25a54c5335 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 14 Jun 2024 16:46:32 -0400 Subject: [PATCH 104/119] Update samplesheet paths to mlst.json reports --- tests/data/samplesheets/samplesheet-multiple_keys.csv | 2 +- tests/data/samplesheets/samplesheet-multiple_queries.csv | 2 +- tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv | 2 +- tests/data/samplesheets/samplesheet_emptyJSON.csv | 2 +- tests/data/samplesheets/samplesheet_gzip.csv | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/data/samplesheets/samplesheet-multiple_keys.csv b/tests/data/samplesheets/samplesheet-multiple_keys.csv index 867d7d6..74f034a 100644 --- a/tests/data/samplesheets/samplesheet-multiple_keys.csv +++ b/tests/data/samplesheets/samplesheet-multiple_keys.csv @@ -2,4 +2,4 @@ sample,mlst_alleles,address sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1
sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv b/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv index cdd0bf0..90d6289 100644 --- a/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv +++ b/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv @@ -2,4 +2,4 @@ sample,mlst_alleles,address sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 -sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json,1.1.2 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet_emptyJSON.csv b/tests/data/samplesheets/samplesheet_emptyJSON.csv index efcb1bb..0b84688 100644 --- a/tests/data/samplesheets/samplesheet_emptyJSON.csv +++ b/tests/data/samplesheets/samplesheet_emptyJSON.csv @@ -1,6 +1,6 @@ sample,mlst_alleles,address sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 -sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample2_empty.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2_empty.mlst.json,1.1.1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet_gzip.csv b/tests/data/samplesheets/samplesheet_gzip.csv index 2337c78..e35b3e9 100644 --- a/tests/data/samplesheets/samplesheet_gzip.csv +++ b/tests/data/samplesheets/samplesheet_gzip.csv @@ -1,5 +1,5 @@ sample,mlst_alleles,address sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, -sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample1.mlst.json.gz,1.1.1 +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json.gz,1.1.1 sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 From 95c0f5ca18a3d6c4176e72b0c9daaf25a5e872b2 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 17 Jun 2024 12:58:54 -0400 Subject: [PATCH 105/119] Add gas parameter tests --- tests/pipelines/main_gm_threshold.nf.test | 135 ++++++++++++++++++++++ workflows/gas_nomenclature.nf | 22 +++- 2 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 tests/pipelines/main_gm_threshold.nf.test diff --git a/tests/pipelines/main_gm_threshold.nf.test b/tests/pipelines/main_gm_threshold.nf.test new file mode 100644 index 0000000..a060db7 --- /dev/null +++ b/tests/pipelines/main_gm_threshold.nf.test @@ 
-0,0 +1,135 @@ +nextflow_pipeline { + + name "Integration Tests of adjusting gm_thresholds parameters" + script "main.nf" + + test("Test fail pipeline if null threshold set") { + tag "pipeline_failure_null_threshold" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = null + } + } + + then { + assert workflow.failed + assert workflow.stdout.contains("ERROR ~ --gm_thresholds null: Cannot pass null or empty string") + } + } + + test("Test fail pipeline if empty threshold set") { + tag "pipeline_failure_no_threshold" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "" + } + } + + then { + assert workflow.failed + assert workflow.stdout.contains("ERROR ~ --gm_thresholds : Cannot pass null or empty string") + } + } + + test("Test fail pipeline if negative threshold set") { + tag "pipeline_failure_negative_threshold" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "-1" + } + } + + then { + assert workflow.failed + assert workflow.stderr.contains('* --gm_thresholds: string [-1] does not match pattern ^(\\d+(\\.\\d+)?,)*\\d+(\\.\\d+)?$ (-1)') + } + } + + test("Test fail pipeline if mismatch between thresholds and scaled distm") { + tag "pipeline_failure_threshold_scaled" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "1,0.5,2" + pd_distm = "scaled" + } + } + + then { + assert workflow.failed + assert workflow.stdout.contains("ERROR ~ '--pd_distm scaled' is set, but '--gm_thresholds 1,0.5,2' contains thresholds outside of range [0,1]." + + " Please either set '--pd_distm hamming' or adjust the threshold values.") + } + } + + test("Test fail pipeline if mismatch between thresholds and hamming distm") { + tag "pipeline_failure_threshold_hamming" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "2,1,0.5" + pd_distm = "hamming" + } + } + + then { + assert workflow.failed + assert workflow.stdout.contains("ERROR ~ '--pd_distm hamming' is set, but '--gm_thresholds 2,1,0.5' contains fractions." 
+ + " Please either set '--pd_distm scaled' or remove fractions from distance thresholds.") + } + } + + test("Test pipeline with single threshold set to 1") { + tag "pipeline_thresh_1" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "1" + } + } + + then { + assert workflow.failed + assert (workflow.stdout =~ /Error \[1.0\] supplied thresholds do not equal the number of threshold columns in reference_clusters.txt/).find() + } + } + + test("Test pipeline with threshold set to 1,0") { + tag "pipeline_thresh_1.0" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "1,0" + } + } + + then { + assert workflow.failed + assert (workflow.stdout =~ /Error \[1.0, 0.0\] supplied thresholds do not equal the number of threshold columns in reference_clusters.txt/).find() + } + } +} diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 6225f3a..b0b05bb 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -119,7 +119,27 @@ workflow GAS_NOMENCLATURE { expected_clusters = CLUSTER_FILE(clusters) - // GAS CALL + // GAS CALL processes + + if(params.gm_thresholds == null || params.gm_thresholds == ""){ + exit 1, "--gm_thresholds ${params.gm_thresholds}: Cannot pass null or empty string" + } + + gm_thresholds_list = params.gm_thresholds.split(',') + if (params.pd_distm == 'hamming') { + if (gm_thresholds_list.any { it != null && it.contains('.') }) { + exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains fractions." + + " Please either set '--pd_distm scaled' or remove fractions from distance thresholds.") + } + } else if (params.pd_distm == 'scaled') { + if (gm_thresholds_list.any { it != null && (it as Float < 0 || it as Float > 1) }) { + exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains thresholds outside of range [0,1]." + + " Please either set '--pd_distm hamming' or adjust the threshold values.") + } + } else { + exit 1, "'--pd_distm ${params.pd_distm}' is an invalid value. Please set to either 'hamming' or 'scaled'." 
+ } + called_data = GAS_CALL(expected_clusters.text, distances.results) ch_versions = ch_versions.mix(called_data.versions) From 0b2ba27673c50398fbb4fcd8e51f58ee8278777c Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 17 Jun 2024 14:56:01 -0400 Subject: [PATCH 106/119] Add test for hash-missing --- .../data/called/expected_results_missing.txt | 5 ++ .../clusters/expected_clusters_missing.txt | 4 ++ .../data/distances/expected_dists_missing.txt | 5 ++ .../data/irida/missing_iridanext.output.json | 17 ++++++ .../profiles/expected-profile_missing1.tsv | 5 ++ .../profiles/expected-profile_missing2.tsv | 2 + tests/data/reports/sample2_missing.mlst.json | 7 +++ tests/data/reports/sample3_missing.mlst.json | 7 +++ .../samplesheets/samplesheet-hash_missing.csv | 6 ++ tests/pipelines/main_missing_alleles.nf.test | 61 +++++++++++++++++++ 10 files changed, 119 insertions(+) create mode 100644 tests/data/called/expected_results_missing.txt create mode 100644 tests/data/clusters/expected_clusters_missing.txt create mode 100644 tests/data/distances/expected_dists_missing.txt create mode 100644 tests/data/irida/missing_iridanext.output.json create mode 100644 tests/data/profiles/expected-profile_missing1.tsv create mode 100644 tests/data/profiles/expected-profile_missing2.tsv create mode 100644 tests/data/reports/sample2_missing.mlst.json create mode 100644 tests/data/reports/sample3_missing.mlst.json create mode 100644 tests/data/samplesheets/samplesheet-hash_missing.csv create mode 100644 tests/pipelines/main_missing_alleles.nf.test diff --git a/tests/data/called/expected_results_missing.txt b/tests/data/called/expected_results_missing.txt new file mode 100644 index 0000000..26b264c --- /dev/null +++ b/tests/data/called/expected_results_missing.txt @@ -0,0 +1,5 @@ +id address level_1 +sample1 1 1 +sample2 1 1 +sample3 2 2 +sampleQ 1 1 diff --git a/tests/data/clusters/expected_clusters_missing.txt b/tests/data/clusters/expected_clusters_missing.txt new file mode 100644 index 0000000..186ff1d --- /dev/null +++ b/tests/data/clusters/expected_clusters_missing.txt @@ -0,0 +1,4 @@ +id address level_1 +sample1 1 1 +sample2 1 1 +sample3 2 2 diff --git a/tests/data/distances/expected_dists_missing.txt b/tests/data/distances/expected_dists_missing.txt new file mode 100644 index 0000000..84ea004 --- /dev/null +++ b/tests/data/distances/expected_dists_missing.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 diff --git a/tests/data/irida/missing_iridanext.output.json b/tests/data/irida/missing_iridanext.output.json new file mode 100644 index 0000000..2945ee7 --- /dev/null +++ b/tests/data/irida/missing_iridanext.output.json @@ -0,0 +1,17 @@ +{ + "files": { + "global": [ + + ], + "samples": { + + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1" + } + } + } +} \ No newline at end of file diff --git a/tests/data/profiles/expected-profile_missing1.tsv b/tests/data/profiles/expected-profile_missing1.tsv new file mode 100644 index 0000000..6d37496 --- /dev/null +++ b/tests/data/profiles/expected-profile_missing1.tsv @@ -0,0 +1,5 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sample1 1 1 1 +sample2 - 1 1 +sample3 - 1 2 diff --git a/tests/data/profiles/expected-profile_missing2.tsv b/tests/data/profiles/expected-profile_missing2.tsv new file mode 100644 index 0000000..44020cb --- /dev/null +++ b/tests/data/profiles/expected-profile_missing2.tsv @@ -0,0 +1,2 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 diff --git 
a/tests/data/reports/sample2_missing.mlst.json b/tests/data/reports/sample2_missing.mlst.json new file mode 100644 index 0000000..113e15b --- /dev/null +++ b/tests/data/reports/sample2_missing.mlst.json @@ -0,0 +1,7 @@ +{ + "sample2": { + "l1": "-", + "l2": "1", + "l3": "1" + } +} diff --git a/tests/data/reports/sample3_missing.mlst.json b/tests/data/reports/sample3_missing.mlst.json new file mode 100644 index 0000000..49942f8 --- /dev/null +++ b/tests/data/reports/sample3_missing.mlst.json @@ -0,0 +1,7 @@ +{ + "sample3": { + "l1": "-", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/samplesheets/samplesheet-hash_missing.csv b/tests/data/samplesheets/samplesheet-hash_missing.csv new file mode 100644 index 0000000..025ed36 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-hash_missing.csv @@ -0,0 +1,6 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sample2_missing.mlst.json,1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sample3_missing.mlst.json,2 + diff --git a/tests/pipelines/main_missing_alleles.nf.test b/tests/pipelines/main_missing_alleles.nf.test new file mode 100644 index 0000000..2d793a6 --- /dev/null +++ b/tests/pipelines/main_missing_alleles.nf.test @@ -0,0 +1,61 @@ +nextflow_pipeline { + + name "Integration Tests for parameters dealing with missing or removed alleles" + script "main.nf" + + test("Full pipeline hashes and missing data") { + tag "pipeline_hashes_missing" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-hash_missing.csv" + outdir = "results" + + gm_thresholds = "1" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check merged profiles + def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile_missing1.tsv") + assert actual_profile_ref.text == expected_profile_tsv.text + + // Check query profiles + def actual_profile_query = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv") + def expected_profile_query_tsv = path("$baseDir/tests/data/profiles/expected-profile_missing2.tsv") + assert actual_profile_query.text == expected_profile_query_tsv.text + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_missing.txt") + assert actual_distances.text == expected_distances.text + + // Verify cluster file + def actual_cluster = path("$launchDir/results/cluster/reference_clusters.txt") + def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters_missing.txt") + assert actual_cluster.text == expected_cluster.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_missing.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/missing_iridanext.output.json").json + + def 
iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1" + } + } + + +} + From 27ff57945bbc0e6270c8f98d2c200f1e84fa13f7 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 17 Jun 2024 15:20:26 -0400 Subject: [PATCH 107/119] Added test when counting missing data --- .../called/expected_results_count-missing.txt | 5 +++ .../expected_dists_count-missing.txt | 5 +++ .../irida/count-missing_iridanext.output.json | 13 ++++++ .../data/irida/missing_iridanext.output.json | 10 ++--- .../samplesheets/samplesheet-hash_missing.csv | 1 - tests/pipelines/main_missing_alleles.nf.test | 40 ++++++++++++++++++- 6 files changed, 65 insertions(+), 9 deletions(-) create mode 100644 tests/data/called/expected_results_count-missing.txt create mode 100644 tests/data/distances/expected_dists_count-missing.txt create mode 100644 tests/data/irida/count-missing_iridanext.output.json diff --git a/tests/data/called/expected_results_count-missing.txt b/tests/data/called/expected_results_count-missing.txt new file mode 100644 index 0000000..26b264c --- /dev/null +++ b/tests/data/called/expected_results_count-missing.txt @@ -0,0 +1,5 @@ +id address level_1 +sample1 1 1 +sample2 1 1 +sample3 2 2 +sampleQ 1 1 diff --git a/tests/data/distances/expected_dists_count-missing.txt b/tests/data/distances/expected_dists_count-missing.txt new file mode 100644 index 0000000..1313023 --- /dev/null +++ b/tests/data/distances/expected_dists_count-missing.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 2 +sampleQ sample3 3 diff --git a/tests/data/irida/count-missing_iridanext.output.json b/tests/data/irida/count-missing_iridanext.output.json new file mode 100644 index 0000000..2f0745e --- /dev/null +++ b/tests/data/irida/count-missing_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1" + } + } + } +} diff --git a/tests/data/irida/missing_iridanext.output.json b/tests/data/irida/missing_iridanext.output.json index 2945ee7..2f0745e 100644 --- a/tests/data/irida/missing_iridanext.output.json +++ b/tests/data/irida/missing_iridanext.output.json @@ -1,11 +1,7 @@ { "files": { - "global": [ - - ], - "samples": { - - } + "global": [], + "samples": {} }, "metadata": { "samples": { @@ -14,4 +10,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/data/samplesheets/samplesheet-hash_missing.csv b/tests/data/samplesheets/samplesheet-hash_missing.csv index 025ed36..bce4982 100644 --- a/tests/data/samplesheets/samplesheet-hash_missing.csv +++ b/tests/data/samplesheets/samplesheet-hash_missing.csv @@ -3,4 +3,3 @@ sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/dat sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1 sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sample2_missing.mlst.json,1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sample3_missing.mlst.json,2 - diff --git a/tests/pipelines/main_missing_alleles.nf.test b/tests/pipelines/main_missing_alleles.nf.test index 2d793a6..3b54854 100644 --- a/tests/pipelines/main_missing_alleles.nf.test 
+++ b/tests/pipelines/main_missing_alleles.nf.test @@ -56,6 +56,44 @@ nextflow_pipeline { } } - + test("Full pipeline hashes and missing data count missing as differences") { + tag "pipeline_hashes_missing_count_missing" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-hash_missing.csv" + outdir = "results" + + gm_thresholds = "1" + pd_count_missing = true + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_count-missing.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_count-missing.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/count-missing_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1" + } + } + } From 23ec4498ec068b6492dbb0cf2f2ae06374bda9b5 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 17 Jun 2024 15:35:14 -0400 Subject: [PATCH 108/119] Added test for removing missing loci --- .../called/expected_results_loci-missing.txt | 5 +++ .../distances/expected_dists_loci-missing.txt | 5 +++ .../irida/loci-missing_iridanext.output.json | 13 +++++++ tests/pipelines/main_missing_alleles.nf.test | 39 +++++++++++++++++++ 4 files changed, 62 insertions(+) create mode 100644 tests/data/called/expected_results_loci-missing.txt create mode 100644 tests/data/distances/expected_dists_loci-missing.txt create mode 100644 tests/data/irida/loci-missing_iridanext.output.json diff --git a/tests/data/called/expected_results_loci-missing.txt b/tests/data/called/expected_results_loci-missing.txt new file mode 100644 index 0000000..26b264c --- /dev/null +++ b/tests/data/called/expected_results_loci-missing.txt @@ -0,0 +1,5 @@ +id address level_1 +sample1 1 1 +sample2 1 1 +sample3 2 2 +sampleQ 1 1 diff --git a/tests/data/distances/expected_dists_loci-missing.txt b/tests/data/distances/expected_dists_loci-missing.txt new file mode 100644 index 0000000..1313023 --- /dev/null +++ b/tests/data/distances/expected_dists_loci-missing.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 2 +sampleQ sample3 3 diff --git a/tests/data/irida/loci-missing_iridanext.output.json b/tests/data/irida/loci-missing_iridanext.output.json new file mode 100644 index 0000000..2f0745e --- /dev/null +++ b/tests/data/irida/loci-missing_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1" + } + } + } +} diff --git a/tests/pipelines/main_missing_alleles.nf.test b/tests/pipelines/main_missing_alleles.nf.test index 3b54854..7d24c06 100644 --- a/tests/pipelines/main_missing_alleles.nf.test +++ b/tests/pipelines/main_missing_alleles.nf.test @@ -95,5 +95,44 @@ nextflow_pipeline { } } + test("Full pipeline 
remove loci with missing data") { + tag "pipeline_hashes_remove_missing_loci" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-hash_missing.csv" + outdir = "results" + + gm_thresholds = "1" + pd_count_missing = true + pd_missing_threshold = 0.5 + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_loci-missing.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_loci-missing.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/loci-missing_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1" + } + } } From 01b1e4b0f6c27ea8b5c2c7f4397f22afb6f30309 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 17 Jun 2024 16:32:10 -0400 Subject: [PATCH 109/119] Added tests with pd_columns --- .../columns/keep-zero-loci-empty-file.txt | 1 + tests/pipelines/main_missing_alleles.nf.test | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 tests/data/columns/keep-zero-loci-empty-file.txt diff --git a/tests/data/columns/keep-zero-loci-empty-file.txt b/tests/data/columns/keep-zero-loci-empty-file.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/data/columns/keep-zero-loci-empty-file.txt @@ -0,0 +1 @@ + diff --git a/tests/pipelines/main_missing_alleles.nf.test b/tests/pipelines/main_missing_alleles.nf.test index 7d24c06..51b2e71 100644 --- a/tests/pipelines/main_missing_alleles.nf.test +++ b/tests/pipelines/main_missing_alleles.nf.test @@ -134,5 +134,41 @@ nextflow_pipeline { assert iridanext_metadata.sampleQ."address" == "1" } } + + test("Test fail pipeline if non-existent columns file is passed") { + tag "pipeline_failure_columns_no_exist" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-hash_missing.csv" + outdir = "results" + + pd_columns = "./no-exist" + } + } + + then { + assert workflow.failed + assert workflow.stderr.contains("* --pd_columns: the file or directory './no-exist' does not exist.") + } + } + + test("Test failure of pipeline when keeping no loci") { + tag "pipeline_keep_zero_loci" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-hash_missing.csv" + outdir = "results" + + gm_thresholds = "0" + pd_columns = "$baseDir/tests/data/columns/keep-zero-loci-empty-file.txt" + } + } + + then { + assert workflow.failed + } + } } From b08a816c6ced09fa2819e1ef3efcc0989642ac49 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 17 Jun 2024 17:02:25 -0400 Subject: [PATCH 110/119] Update documentation --- CHANGELOG.md | 13 +++++++++++-- CITATIONS.md | 12 ++++++++++++ LICENSE | 2 +- docs/output.md | 5 +++-- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a15fda..0a0b949 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## In-development -- Fixed nf-core tools linting failures introduced in version 2.12.1. -- Added phac-nml prefix to nf-core config +## 1.0.5 - 2024/06/17 + +- Updated modules to include: + + - `input_assure`: Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key and enforces necessary changes where discrepancies are found. + - `cluster_file`: Generates the expected_clusters.txt file from reference sample addresses for use in GAS_call. + - `filter_query`: Filters and generates a csv file containing only the cluster addresses for query samples. + +- Pinned nf-iridanext plugin +- Added tests for the full pipeline, independent modules, and input parameters +- Updated documentation and configuration files ## 1.0.3 - 2024/02/23 diff --git a/CITATIONS.md b/CITATIONS.md index 84e1767..600a9e2 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,18 @@ ## Pipeline tools +- [locidex](https://github.com/phac-nml/locidex) (in-development, citation subject to change) + + > Robertson, James, Wells, Matthew, Christy-Lynn, Peterson, Kyrylo Bessonov, Reimer, Aleisha, Schonfeld, Justin. LOCIDEX: Distributed allele calling engine. 2024. https://github.com/phac-nml/locidex + +- [profile_dists](https://github.com/phac-nml/profile_dists) (in-development, citation subject to change) + + > Robertson, James, Wells, Matthew, Schonfeld, Justin, Reimer, Aleisha. Profile Dists: Convenient package for comparing genetic similarity of samples based on allelic profiles. 2023. https://github.com/phac-nml/profile_dists + +- [genomic_address_service (GAS)](https://github.com/phac-nml/genomic_address_service) (in-development, citation subject to change) + + > Robertson, James, Wells, Matthew, Schonfeld, Justin, Reimer, Aleisha. Genomic Address Service: Convenient package for de novo clustering and sample assignment to existing clusters. 2023. https://github.com/phac-nml/genomic_address_service + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/LICENSE b/LICENSE index ae9c66b..0ca6cdb 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Aaron Petkau +Copyright (c) Government of Canada Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/output.md b/docs/output.md index 4ad48e8..27a33c2 100644 --- a/docs/output.md +++ b/docs/output.md @@ -20,7 +20,7 @@ The IRIDA Next-compliant JSON output file will be named `iridanext.output.json.g The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [Input check](#input-check) - Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key. +- [Input assure](#input-assure) - Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key and enforces necessary changes where discrepancies are found. - [Locidex merge](#locidex-merge) - Merges MLST profile JSON files into a single profiles file for reference and query samples. - [Profile dists](#profile-dists) - Computes pairwise distances between genomes using MLST allele differences.
- [Cluster file](#cluster-file) - Generates the expected_clusters.txt file from reference sample addresses for use in GAS_call. @@ -29,13 +29,14 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [IRIDA Next Output](#irida-next-output) - Generates a JSON output file that is compliant with IRIDA Next - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### Input Check +### Input Assure
Output files - `input/` - `sampleID_error_report.csv` + - `sampleID.mlst.json.gz`
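The Input Assure step described above reduces to one check per sample: the samplesheet ID must match the top-level key of its MLST JSON, with any mismatch reported and then corrected in favour of the samplesheet ID. A minimal Groovy sketch of that idea (hypothetical; the real `input_assure` module is a separate script and its report format may differ):

```groovy
// Hypothetical sketch of the input-assure check; the actual module's
// implementation and sampleID_error_report.csv columns may differ.
import groovy.json.JsonSlurper
import groovy.json.JsonOutput

def assureSampleKey(String sampleId, File mlstJson) {
    def data = new JsonSlurper().parse(mlstJson)
    def key = data.keySet().first()
    if (key != sampleId) {
        // Report the discrepancy, then enforce the samplesheet sample ID.
        println "${sampleId},${key},MLST JSON key does not match sample ID"
        data = [(sampleId): data[key]]
    }
    return JsonOutput.prettyPrint(JsonOutput.toJson(data))
}
```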
From cdbe9ef8b5216a38abaa77aa314d9ebb47a0c12f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 19 Jun 2024 14:20:13 -0400 Subject: [PATCH 111/119] Update CHANGELOG.md --- CHANGELOG.md | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a0b949..2ede076 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,39 +3,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## In-development +## 0.1.0 - 2024/06/18 -## 1.0.5 - 2024/06/17 - -- Updated modules to include: - - - `input_assure`: Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key and enforces necessary changes where discrepancies are found. - - `cluster_file`: Generates the expected_clusters.txt file from reference sample addresses for use in GAS_call. - - `filter_query`: Filters and generates a csv file containing only the cluster addresses for query samples. - -- Pinned nf-iridanext plugin -- Added tests for the full pipeline, independent modules, and input parameters -- Updated documentation and configuration files -## 1.0.3 - 2024/02/23 - -- Pinned nf-validation@1.1.3 plugin - -## 1.0.2 - 2023/12/18 - -- Removed GitHub workflows that weren't needed. -- Adding additional parameters for testing purposes. - -## 1.0.1 - 2023/12/06 - -Allowing non-gzipped FASTQ files as input. Default branch is now main. - -## 1.0.0 - 2023/11/30 - -Initial release of phac-nml/gasnomenclature, created with the [nf-core](https://nf-co.re/) template. +Initial release of the Genomic Address Nomenclature pipeline to be used to assign cluster addresses to samples based on existing cluster designations. ### `Added` +- Input of cg/wgMLST allele calls produced from [locidex](https://github.com/phac-nml/locidex). +- Output of assigned cluster addresses for any **query** samples using [profile_dists](https://github.com/phac-nml/profile_dists) and [gas call](https://github.com/phac-nml/genomic_address_service). + ### `Fixed` ### `Dependencies` ### `Deprecated` From 63e69b3e79c6accbb81411fc5a31ef5f6fd45d02 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 19 Jun 2024 14:22:50 -0400 Subject: [PATCH 112/119] Update workflow to allow gm_thresholds to be a single integer --- workflows/gas_nomenclature.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index b0b05bb..f120586 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -125,7 +125,7 @@ workflow GAS_NOMENCLATURE { exit 1, "--gm_thresholds ${params.gm_thresholds}: Cannot pass null or empty string" } - gm_thresholds_list = params.gm_thresholds.split(',') + gm_thresholds_list = params.gm_thresholds.toString().split(',') if (params.pd_distm == 'hamming') { if (gm_thresholds_list.any { it != null && it.contains('.') }) { exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains fractions."
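The `.toString()` call in the hunk above is the entire fix: when a bare number is passed (for example `--gm_thresholds 1`), Nextflow coerces the parameter to an Integer, which has no `split()` method, while a comma-delimited value such as `--gm_thresholds "10,5,0"` arrives as a String. A minimal Groovy sketch of the behaviour (assuming that CLI type coercion):

```groovy
// Minimal sketch, assuming Nextflow coerces a bare numeric CLI value to an
// Integer: calling split(',') directly on it would throw a
// MissingMethodException, so toString() first makes both cases safe.
def toThresholdList = { value -> value.toString().split(',') as List }

assert toThresholdList(1)        == ['1']             // single integer
assert toThresholdList("10,5,0") == ['10', '5', '0']  // delimited string
```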
From c57ef6c0f1a1817ec11e7782369a133e20e8adfd Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 19 Jun 2024 16:29:45 -0400 Subject: [PATCH 113/119] Add success tests for different thresholds --- .../data/called/expected_results_thresh_1.txt | 5 ++ .../called/expected_results_thresh_1_0.txt | 5 ++ .../distances/expected_dists_thresh_1.txt | 5 ++ .../distances/expected_dists_thresh_1_0.txt | 5 ++ .../irida/thresh1.0_iridanext.output.json | 13 +++ .../data/irida/thresh1_iridanext.output.json | 13 +++ .../samplesheets/samplesheet_thresh_1.csv | 5 ++ .../samplesheets/samplesheet_thresh_1_0.csv | 5 ++ tests/pipelines/main_gm_threshold.nf.test | 88 +++++++++++++++++-- 9 files changed, 138 insertions(+), 6 deletions(-) create mode 100644 tests/data/called/expected_results_thresh_1.txt create mode 100644 tests/data/called/expected_results_thresh_1_0.txt create mode 100644 tests/data/distances/expected_dists_thresh_1.txt create mode 100644 tests/data/distances/expected_dists_thresh_1_0.txt create mode 100644 tests/data/irida/thresh1.0_iridanext.output.json create mode 100644 tests/data/irida/thresh1_iridanext.output.json create mode 100644 tests/data/samplesheets/samplesheet_thresh_1.csv create mode 100644 tests/data/samplesheets/samplesheet_thresh_1_0.csv diff --git a/tests/data/called/expected_results_thresh_1.txt b/tests/data/called/expected_results_thresh_1.txt new file mode 100644 index 0000000..165001a --- /dev/null +++ b/tests/data/called/expected_results_thresh_1.txt @@ -0,0 +1,5 @@ +id address level_1 +sample1 1 1 +sample2 1 1 +sample3 1 1 +sampleQ 1 1 diff --git a/tests/data/called/expected_results_thresh_1_0.txt b/tests/data/called/expected_results_thresh_1_0.txt new file mode 100644 index 0000000..c2ddc4f --- /dev/null +++ b/tests/data/called/expected_results_thresh_1_0.txt @@ -0,0 +1,5 @@ +id address level_1 level_2 +sample1 1.1 1 1 +sample2 1.1 1 1 +sample3 1.1 1 1 +sampleQ 1.2 1 2 diff --git a/tests/data/distances/expected_dists_thresh_1.txt b/tests/data/distances/expected_dists_thresh_1.txt new file mode 100644 index 0000000..84ea004 --- /dev/null +++ b/tests/data/distances/expected_dists_thresh_1.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 diff --git a/tests/data/distances/expected_dists_thresh_1_0.txt b/tests/data/distances/expected_dists_thresh_1_0.txt new file mode 100644 index 0000000..84ea004 --- /dev/null +++ b/tests/data/distances/expected_dists_thresh_1_0.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 diff --git a/tests/data/irida/thresh1.0_iridanext.output.json b/tests/data/irida/thresh1.0_iridanext.output.json new file mode 100644 index 0000000..d85169a --- /dev/null +++ b/tests/data/irida/thresh1.0_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.2" + } + } + } +} diff --git a/tests/data/irida/thresh1_iridanext.output.json b/tests/data/irida/thresh1_iridanext.output.json new file mode 100644 index 0000000..2f0745e --- /dev/null +++ b/tests/data/irida/thresh1_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1" + } + } + } +} diff --git a/tests/data/samplesheets/samplesheet_thresh_1.csv b/tests/data/samplesheets/samplesheet_thresh_1.csv new file mode 100644 index 0000000..f4b6b93 --- /dev/null +++ 
b/tests/data/samplesheets/samplesheet_thresh_1.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1 diff --git a/tests/data/samplesheets/samplesheet_thresh_1_0.csv b/tests/data/samplesheets/samplesheet_thresh_1_0.csv new file mode 100644 index 0000000..9260f3f --- /dev/null +++ b/tests/data/samplesheets/samplesheet_thresh_1_0.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1 diff --git a/tests/pipelines/main_gm_threshold.nf.test b/tests/pipelines/main_gm_threshold.nf.test index a060db7..dd4e2fa 100644 --- a/tests/pipelines/main_gm_threshold.nf.test +++ b/tests/pipelines/main_gm_threshold.nf.test @@ -97,12 +97,12 @@ nextflow_pipeline { } } - test("Test pipeline with single threshold set to 1") { - tag "pipeline_thresh_1" + test("Test fail pipeline with single threshold set to 1") { + tag "pipeline_thresh_1_fail" when { params { - input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + input = "$baseDir/tests/data/samplesheets/samplesheet_thresh_1.csv" outdir = "results" gm_thresholds = "1" @@ -115,12 +115,50 @@ nextflow_pipeline { } } - test("Test pipeline with threshold set to 1,0") { - tag "pipeline_thresh_1.0" + test("Test pipeline with single threshold set to 1") { + tag "pipeline_thresh_1_success" when { params { - input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + input = "$baseDir/tests/data/samplesheets/samplesheet_thresh_1.csv" + outdir = "results" + + gm_thresholds = "1" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_thresh_1.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_thresh_1.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/thresh1_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1" + } + } + + test("Test fail pipeline with threshold set to 1,0") { + tag "pipeline_thresh_1_0_fail" + + when { + params { + input = 
"$baseDir/tests/data/samplesheets/samplesheet_thresh_1_0.csv" outdir = "results" gm_thresholds = "1,0" @@ -132,4 +170,42 @@ nextflow_pipeline { assert (workflow.stdout =~ /Error \[1.0, 0.0\] supplied thresholds do not equal the number of threshold columns in reference_clusters.txt/).find() } } + + test("Test pipeline with threshold set to 1,0") { + tag "pipeline_thresh_1_0_success" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_thresh_1_0.csv" + outdir = "results" + + gm_thresholds = "1,0" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_thresh_1_0.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_thresh_1_0.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/thresh1.0_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.2" + } + } } From cc8b033afef9e79c5d223938cf24d4b41fa82b2b Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 19 Jun 2024 16:42:48 -0400 Subject: [PATCH 114/119] Fix error in main_gm_threshold.nf.test --- tests/pipelines/main_gm_threshold.nf.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/main_gm_threshold.nf.test b/tests/pipelines/main_gm_threshold.nf.test index dd4e2fa..6b7e838 100644 --- a/tests/pipelines/main_gm_threshold.nf.test +++ b/tests/pipelines/main_gm_threshold.nf.test @@ -102,7 +102,7 @@ nextflow_pipeline { when { params { - input = "$baseDir/tests/data/samplesheets/samplesheet_thresh_1.csv" + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" outdir = "results" gm_thresholds = "1" @@ -158,7 +158,7 @@ nextflow_pipeline { when { params { - input = "$baseDir/tests/data/samplesheets/samplesheet_thresh_1_0.csv" + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" outdir = "results" gm_thresholds = "1,0" From 1ed30b854053014d8b234ffb813421babfd030c7 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 24 Jun 2024 09:22:20 -0400 Subject: [PATCH 115/119] Updating tests and documentation to correct 'scaled' distances --- README.md | 22 ++++++++-- tests/pipelines/main.nf.test | 50 +++++++++++++++++++++++ tests/pipelines/main_gm_threshold.nf.test | 4 +- workflows/gas_nomenclature.nf | 4 +- 4 files changed, 73 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 4aa8b43..696b275 100644 --- a/README.md +++ b/README.md @@ -24,11 +24,27 @@ Details on the columns can be found in the [Full samplesheet](docs/usage.md#full The main parameters are `--input` as defined above and `--output` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers and `-r [branch]` to specify which GitHub branch you would like to run. 
-## Profile dists +## Distance Method and Thresholds + +Profile_Dists and the Genomic Address Service workflows can use two distance methods: hamming or scaled. + +### Hamming Distances + +Hamming distances are integers representing the number of differing loci between two sequences and will range between [0, n], where `n` is the total number of loci. When using Hamming distances, you must specify `--pd_distm hamming` and provide Hamming distance thresholds as integers between [0, n]: `--gm_thresholds "10,5,0"` (10, 5, and 0 loci). + +### Scaled Distances + +Scaled distances are floats representing the percentage of differing loci between two sequences and will range between [0.0, 100.0]. When using scaled distances, you must specify `--pd_distm scaled` and provide percentages between [0.0, 100.0] as thresholds: `--gm_thresholds "50,20,0"` (50%, 20%, and 0% of loci). + +### Thresholds + +The `--gm_thresholds` parameter is used to set thresholds for each cluster level, which in turn are used to assign cluster addresses at each level. When specifying `--pd_distm hamming` and `--gm_thresholds "10,5,0"`, all sequences that have no more than 10 loci differences will be assigned the same cluster code for the first level, no more than 5 for the second level, and only sequences that have no loci differences will be assigned the same cluster code for the third level. + +## Profile_dists The following can be used to adjust parameters for the [profile_dists][] tool. -- `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0 and 1. +- `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0.0 and 100.0. Please see the [Distance Method and Thresholds](#distance-method-and-thresholds) section for more information. - `--pd_missing_threshold`: The maximum proportion of missing data per locus for a locus to be kept in the analysis. Values from 0 to 1. - `--pd_sample_quality_threshold`: The maximum proportion of missing data per sample for a sample to be kept in the analysis. Values from 0 to 1. - `--pd_file_type`: Output format file type. One of _text_ or _parquet_. @@ -47,7 +63,7 @@ The following can be used to adjust parameters for the [profile_dists][] tool. The following can be used to adjust parameters for the [gas call][] tool. -- `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_). +- `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_). Please see the [Distance Method and Thresholds](#distance-method-and-thresholds) section for more information. - `--gm_method`: The linkage method to use for clustering. Value should be one of _single_, _average_, or _complete_. - `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`. 
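To make the relationship between the two distance units concrete, here is a small illustrative sketch (toy profiles only; profile_dists performs this computation itself) comparing one query against one reference:

```groovy
// Illustrative only: how hamming and scaled distances relate for a toy
// pair of allele profiles in which 1 of 3 loci differ.
def query = [l1: '1', l2: '2', l3: '1']
def ref   = [l1: '1', l2: '1', l3: '1']

def loci    = query.keySet()
def hamming = loci.count { locus -> query[locus] != ref[locus] }  // 1
def scaled  = 100.0 * hamming / loci.size()                       // 33.33...

println "hamming: ${hamming}"  // integer count of differing loci
println "scaled:  ${scaled}"   // percentage of differing loci
```

This matches the expected distance data used throughout these tests, where profiles differing at one of three loci yield a scaled distance of 33.333333333333336.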
diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 6716dae..223b7de 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -54,6 +54,56 @@ nextflow_pipeline { } } + test("Small-scale test of full pipeline with scaled distances"){ + tag "pipeline_success_scaled" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + pd_distm = "scaled" + gm_thresholds = "50,20,0" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check merged profiles + def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile_scaled1.tsv") + assert actual_profile_ref.text == expected_profile_tsv.text + + // Check query profiles + def actual_profile_query = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv") + def expected_profile_query_tsv = path("$baseDir/tests/data/profiles/expected-profile_scaled2.tsv") + assert actual_profile_query.text == expected_profile_query_tsv.text + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_scaled.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_scaled.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/scaled_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.2.3" + } + } + + test("Small-scale test of full pipeline with multiple queries"){ tag "pipeline_success_multiple_queries" diff --git a/tests/pipelines/main_gm_threshold.nf.test b/tests/pipelines/main_gm_threshold.nf.test index 6b7e838..fb76112 100644 --- a/tests/pipelines/main_gm_threshold.nf.test +++ b/tests/pipelines/main_gm_threshold.nf.test @@ -65,14 +65,14 @@ nextflow_pipeline { input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" outdir = "results" - gm_thresholds = "1,0.5,2" + gm_thresholds = "200,50,0" pd_distm = "scaled" } } then { assert workflow.failed - assert workflow.stdout.contains("ERROR ~ '--pd_distm scaled' is set, but '--gm_thresholds 1,0.5,2' contains thresholds outside of range [0,1]." + assert workflow.stdout.contains("ERROR ~ '--pd_distm scaled' is set, but '--gm_thresholds 200,50,0' contains thresholds outside of range [0,100]." 
+ " Please either set '--pd_distm hamming' or adjust the threshold values.") } } diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index f120586..8972669 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -132,8 +132,8 @@ workflow GAS_NOMENCLATURE { + " Please either set '--pd_distm scaled' or remove fractions from distance thresholds.") } } else if (params.pd_distm == 'scaled') { - if (gm_thresholds_list.any { it != null && (it as Float < 0 || it as Float > 1) }) { - exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains thresholds outside of range [0,1]." + if (gm_thresholds_list.any { it != null && (it as Float < 0.0 || it as Float > 100.0) }) { + exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains thresholds outside of range [0,100]." + " Please either set '--pd_distm hamming' or adjust the threshold values.") } } else { From 00a97c924908b4104dc48d7c7fa906896bd64e23 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 24 Jun 2024 09:55:50 -0400 Subject: [PATCH 116/119] Added the expected test data files for scaled distances --- tests/data/called/expected_results_scaled.txt | 5 +++++ tests/data/distances/expected_dists_scaled.txt | 5 +++++ tests/data/irida/scaled_iridanext.output.json | 13 +++++++++++++ tests/data/profiles/expected-profile_scaled1.tsv | 5 +++++ tests/data/profiles/expected-profile_scaled2.tsv | 2 ++ 5 files changed, 30 insertions(+) create mode 100644 tests/data/called/expected_results_scaled.txt create mode 100644 tests/data/distances/expected_dists_scaled.txt create mode 100644 tests/data/irida/scaled_iridanext.output.json create mode 100644 tests/data/profiles/expected-profile_scaled1.tsv create mode 100644 tests/data/profiles/expected-profile_scaled2.tsv diff --git a/tests/data/called/expected_results_scaled.txt b/tests/data/called/expected_results_scaled.txt new file mode 100644 index 0000000..bd70a8e --- /dev/null +++ b/tests/data/called/expected_results_scaled.txt @@ -0,0 +1,5 @@ +id address level_1 level_2 level_3 +sample1 1.1.1 1 1 1 +sample2 1.1.1 1 1 1 +sample3 1.1.2 1 1 2 +sampleQ 1.2.3 1 2 3 diff --git a/tests/data/distances/expected_dists_scaled.txt b/tests/data/distances/expected_dists_scaled.txt new file mode 100644 index 0000000..cd51991 --- /dev/null +++ b/tests/data/distances/expected_dists_scaled.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0.0 +sampleQ sample1 33.333333333333336 +sampleQ sample2 33.333333333333336 +sampleQ sample3 66.66666666666667 diff --git a/tests/data/irida/scaled_iridanext.output.json b/tests/data/irida/scaled_iridanext.output.json new file mode 100644 index 0000000..3121e6a --- /dev/null +++ b/tests/data/irida/scaled_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.2.3" + } + } + } +} diff --git a/tests/data/profiles/expected-profile_scaled1.tsv b/tests/data/profiles/expected-profile_scaled1.tsv new file mode 100644 index 0000000..6d02526 --- /dev/null +++ b/tests/data/profiles/expected-profile_scaled1.tsv @@ -0,0 +1,5 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sample1 1 1 1 +sample2 1 1 1 +sample3 1 1 2 diff --git a/tests/data/profiles/expected-profile_scaled2.tsv b/tests/data/profiles/expected-profile_scaled2.tsv new file mode 100644 index 0000000..44020cb --- /dev/null +++ b/tests/data/profiles/expected-profile_scaled2.tsv @@ -0,0 +1,2 @@ +sample_id l1 
l2 l3 +sampleQ 1 2 1 From d2e2c801f60a9a32d974d4d5e2a6af7164a508ac Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 26 Jun 2024 17:32:43 -0400 Subject: [PATCH 117/119] Updated README to expand on GAS thresholds and linkage methods --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 696b275..499f513 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,15 @@ Hamming distances are integers representing the number of differing loci between Scaled distances are floats representing the percentage of differing loci between two sequences and will range between [0.0, 100.0]. When using scaled distances, you must specify `--pd_distm scaled` and provide percentages between [0.0, 100.0] as thresholds: `--gm_thresholds "50,20,0"` (50%, 20%, and 0% of loci). -### Thresholds +### Thresholds and Linkage Methods -The `--gm_thresholds` parameter is used to set thresholds for each cluster level, which in turn are used to assign cluster addresses at each level. When specifying `--pd_distm hamming` and `--gm_thresholds "10,5,0"`, all sequences that have no more than 10 loci differences will be assigned the same cluster code for the first level, no more than 5 for the second level, and only sequences that have no loci differences will be assigned the same cluster code for the third level. +The `--gm_thresholds` parameter sets thresholds for each cluster level, which dictate how sequences are assigned cluster codes. These thresholds specify the maximum allowable differences in loci between sequences sharing the same cluster code at each level. The consistency of these thresholds in ensuring uniform cluster codes across levels depends on the `--gm_method` parameter, which determines the linkage method used for clustering. + +- _Complete Linkage_: When using complete linkage clustering, sequences are grouped such that identical cluster codes at a particular level guarantee that all sequences in that cluster are within the specified threshold distance. For example, specifying `--pd_distm hamming` and `--gm_thresholds "10,5,0"` would mean that sequences with no more than 10 loci differences are assigned the same cluster code at the first level, no more than 5 differences at the second level, and identical sequences at the third level. + +- _Average Linkage_: With average linkage clustering, sequences may share the same cluster code if their average distance is below the specified threshold. For instance, sequences with average distances less than 10, 5, and 0 for each level respectively may share the same cluster code. + +- _Single Linkage_: Single linkage clustering can result in merging distant samples into the same cluster if there exists a third sample that bridges the distance between them. This method does not provide strict guarantees on the maximum distance within a cluster, potentially allowing distant sequences to share the same cluster code. 
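A toy worked example of why the linkage method matters (illustrative Groovy only; the actual clustering is implemented by [gas call](https://github.com/phac-nml/genomic_address_service)): three samples where A and C are each 5 loci from B but 10 loci from each other, clustered at a single hamming threshold of 5.

```groovy
// Toy data (not pipeline code): d(A,B) = 5, d(B,C) = 5, d(A,C) = 10,
// clustered at a single hamming threshold of 5.
def samples = ['A', 'B', 'C']
def dists = ['A,B': 5, 'B,C': 5, 'A,C': 10]
def d = { x, y -> x == y ? 0 : (dists[x + ',' + y] ?: dists[y + ',' + x]) }
int threshold = 5

// Single linkage merges two clusters if ANY cross-pair is within the
// threshold, so A joins B and C then chains onto the same cluster even
// though d(A,C) = 10 exceeds the threshold.
def clusters = samples.collect { [it] }
def anyLink = { a, b -> a.any { x -> b.any { y -> d(x, y) <= threshold } } }
boolean merged = true
while (merged) {
    merged = false
    for (i in 0..<clusters.size()) {
        def j = ((i + 1)..<clusters.size()).find { anyLink(clusters[i], clusters[it]) }
        if (j != null) { clusters[i] += clusters.removeAt(j); merged = true; break }
    }
}
assert clusters == [['A', 'B', 'C']]  // one address despite d(A,C) = 10

// Complete linkage would instead require EVERY cross-pair to fit: merging
// [A, B] with [C] needs max(d(A,C), d(B,C)) = 10 <= 5, which fails, so it
// keeps [['A', 'B'], ['C']] and guarantees a maximum within-cluster distance.
```

Which behaviour the pipeline uses is controlled by `--gm_method` (_single_, _average_, or _complete_), as described above.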
## Profile_dists From df7c748897a5f1bec6df375e38243bbfcae0a161 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 27 Jun 2024 13:52:06 -0400 Subject: [PATCH 118/119] Updating docs for release --- CHANGELOG.md | 2 +- nextflow.config | 2 +- tests/data/samplesheets/samplesheet-hash_missing.csv | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ede076..fb91fcd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## 0.1.0 - 2024/06/18 +## 0.1.0 - 2024/06/28 Initial release of the Genomic Address Nomenclature pipeline to be used to assign cluster addresses to samples based on an existing cluster designations. diff --git a/nextflow.config b/nextflow.config index 9734b66..423d59c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -219,7 +219,7 @@ manifest { description = """Gas Nomenclature assignment pipeline""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = 'v0.0.1dev' + version = '0.1.0' doi = '' defaultBranch = 'main' } diff --git a/tests/data/samplesheets/samplesheet-hash_missing.csv b/tests/data/samplesheets/samplesheet-hash_missing.csv index bce4982..7bfe7af 100644 --- a/tests/data/samplesheets/samplesheet-hash_missing.csv +++ b/tests/data/samplesheets/samplesheet-hash_missing.csv @@ -1,5 +1,5 @@ sample,mlst_alleles,address sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1 -sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sample2_missing.mlst.json,1 -sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/add_tests/tests/data/reports/sample3_missing.mlst.json,2 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2_missing.mlst.json,1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3_missing.mlst.json,2 From 32b88c50ab20815f1d61b030d41d51bed405f824 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 27 Jun 2024 14:05:09 -0400 Subject: [PATCH 119/119] Add link to release to CHANGELOG.md --- CHANGELOG.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb91fcd..4b7a138 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## 0.1.0 - 2024/06/28 +## [0.1.0] - 2024/06/28 Initial release of the Genomic Address Nomenclature pipeline to be used to assign cluster addresses to samples based on an existing cluster designations. @@ -12,8 +12,4 @@ Initial release of the Genomic Address Nomenclature pipeline to be used to assig - Input of cg/wgMLST allele calls produced from [locidex](https://github.com/phac-nml/locidex). - Output of assigned cluster addresses for any **query** samples using [profile_dists](https://github.com/phac-nml/profile_dists) and [gas call](https://github.com/phac-nml/genomic_address_service). -### `Fixed` - -### `Dependencies` - -### `Deprecated` +[0.1.0]: https://github.com/phac-nml/gasnomenclature/releases/tag/0.1.0