From 6d76d501cc0f8a5031c5ce134f9e82b6f040cb81 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 11 Jun 2024 11:55:15 -0400 Subject: [PATCH 1/6] Update filter_query to accommodate multiple queries --- modules/local/filter_query/main.nf | 12 +++++++----- workflows/gas_nomenclature.nf | 4 +++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/modules/local/filter_query/main.nf b/modules/local/filter_query/main.nf index 5bb4b17..9912ee5 100644 --- a/modules/local/filter_query/main.nf +++ b/modules/local/filter_query/main.nf @@ -7,7 +7,7 @@ process FILTER_QUERY { 'biocontainers/csvtk:0.22.0--h9ee0642_1' }" input: - val input_query + val query_ids path addresses val in_format val out_format @@ -17,19 +17,19 @@ process FILTER_QUERY { path("versions.yml"), emit: versions script: - - def queryID = input_query[0].id def outputFile = "new_addresses" - def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) def out_extension = out_format == "tsv" ? 
'tsv' : 'csv' + // Join the query IDs in the correct csvtk filter2 required format + def queryID = query_ids.collect { id -> "\$id == \"${id}\"" }.join(" || ") + """ # Filter the query samples only; keep only the 'id' and 'address' columns csvtk filter2 \\ ${addresses} \\ - --filter '\$id == \"$queryID\"' \\ + --filter '$queryID' \\ --delimiter "${delimiter}" \\ --out-delimiter "${out_delimiter}" | \\ csvtk cut -f id,address > ${outputFile}.${out_extension} @@ -39,5 +39,7 @@ process FILTER_QUERY { csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) END_VERSIONS """ + + } diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 813de21..9a8913f 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -142,7 +142,9 @@ workflow GAS_NOMENCLATURE { ch_versions = ch_versions.mix(called_data.versions) // Filter the new queried samples and addresses into a CSV/JSON file for the IRIDANext plug in - new_addresses = FILTER_QUERY(profiles.query, called_data.distances, "tsv", "csv") + query_ids = profiles.query.collect { it[0].id } + + new_addresses = FILTER_QUERY(query_ids, called_data.distances, "tsv", "csv") ch_versions = ch_versions.mix(new_addresses.versions) CUSTOM_DUMPSOFTWAREVERSIONS ( From 59f4d176dedfdad7e144b48e30cc5a4249159a5a Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 11 Jun 2024 11:57:59 -0400 Subject: [PATCH 2/6] Add test for multiple_queries and supporting files --- .../data/called/expected_results_queries.txt | 6 ++ .../data/irida/queries_iridanext.output.json | 16 +++++ .../profiles/expected-profile_queries1.tsv | 6 ++ .../profiles/expected-profile_queries2.tsv | 3 + tests/data/reports/sampleN.mlst.json | 7 +++ .../samplesheet-multiple_queries.csv | 6 ++ tests/pipelines/main.nf.test | 60 +++++++++++++++++++ 7 files changed, 104 insertions(+) create mode 100644 tests/data/called/expected_results_queries.txt create mode 100644 tests/data/irida/queries_iridanext.output.json create mode 100644 
tests/data/profiles/expected-profile_queries1.tsv create mode 100644 tests/data/profiles/expected-profile_queries2.tsv create mode 100644 tests/data/reports/sampleN.mlst.json create mode 100644 tests/data/samplesheets/samplesheet-multiple_queries.csv diff --git a/tests/data/called/expected_results_queries.txt b/tests/data/called/expected_results_queries.txt new file mode 100644 index 0000000..f5e5ae4 --- /dev/null +++ b/tests/data/called/expected_results_queries.txt @@ -0,0 +1,6 @@ +id address level_1 level_2 level_3 +sample1 1.1.1 1 1 1 +sample2 1.1.1 1 1 1 +sample3 1.1.2 1 1 2 +sampleQ 2.2.3 2 2 3 +sampleN 2.2.3 2 2 3 diff --git a/tests/data/irida/queries_iridanext.output.json b/tests/data/irida/queries_iridanext.output.json new file mode 100644 index 0000000..7063e8e --- /dev/null +++ b/tests/data/irida/queries_iridanext.output.json @@ -0,0 +1,16 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "2.2.3" + }, + "sampleN": { + "address": "2.2.3" + } + } + } +} diff --git a/tests/data/profiles/expected-profile_queries1.tsv b/tests/data/profiles/expected-profile_queries1.tsv new file mode 100644 index 0000000..b2f8100 --- /dev/null +++ b/tests/data/profiles/expected-profile_queries1.tsv @@ -0,0 +1,6 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sampleN 1 2 1 +sample1 1 1 1 +sample2 1 1 1 +sample3 1 1 2 diff --git a/tests/data/profiles/expected-profile_queries2.tsv b/tests/data/profiles/expected-profile_queries2.tsv new file mode 100644 index 0000000..4b4d059 --- /dev/null +++ b/tests/data/profiles/expected-profile_queries2.tsv @@ -0,0 +1,3 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sampleN 1 2 1 diff --git a/tests/data/reports/sampleN.mlst.json b/tests/data/reports/sampleN.mlst.json new file mode 100644 index 0000000..178b6db --- /dev/null +++ b/tests/data/reports/sampleN.mlst.json @@ -0,0 +1,7 @@ +{ + "sampleN": { + "l1": "1", + "l2": "2", + "l3": "1" + } +} diff --git 
a/tests/data/samplesheets/samplesheet-multiple_queries.csv b/tests/data/samplesheets/samplesheet-multiple_queries.csv new file mode 100644 index 0000000..eb661ca --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiple_queries.csv @@ -0,0 +1,6 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sampleN,/root/working_directory/nml-phac/gasnomenclature/tests/data/reports/sampleN.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index e35a9c3..e3a467a 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -32,6 +32,11 @@ nextflow_pipeline { def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.txt") assert actual_distances.text == expected_distances.text + // Verify cluster file + def actual_cluster = path("$launchDir/results/cluster/expected_clusters.txt") + def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt") + assert actual_cluster.text == expected_cluster.text + // Check called clusters def actual_calls = path("$launchDir/results/call/Called/results.text") def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") @@ -49,6 +54,61 @@ nextflow_pipeline { } } + test("Small-scale test of full pipeline with multiple queries"){ + tag "pipeline_success_multiple_queries" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiple_queries.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check merged 
profiles + def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile_queries1.tsv") + assert actual_profile_ref.text == expected_profile_tsv.text + + // Check query profiles + def actual_profile_query = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv") + def expected_profile_query_tsv = path("$baseDir/tests/data/profiles/expected-profile_queries2.tsv") + assert actual_profile_query.text == expected_profile_query_tsv.text + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_queries_dists.txt") + assert actual_distances.text == expected_distances.text + + // Verify cluster file + def actual_cluster = path("$launchDir/results/cluster/expected_clusters.txt") + def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt") + assert actual_cluster.text == expected_cluster.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_queries.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/queries_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 2 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.containsKey("sampleN") + + assert iridanext_metadata.sampleQ."address" == "2.2.3" + assert iridanext_metadata.sampleN.address == "2.2.3" + } + } + 
test("Integration test where input contains reference sample with mismatched MLST JSON file"){ tag "pipeline_failure" From 0742c50e81c6296209215ea8044061d65101f802 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 11 Jun 2024 11:58:43 -0400 Subject: [PATCH 3/6] Implement 'fair true' in input_check to ensure consistent ordering of samples --- modules/local/input_check/main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/local/input_check/main.nf b/modules/local/input_check/main.nf index 79a2242..762aeae 100644 --- a/modules/local/input_check/main.nf +++ b/modules/local/input_check/main.nf @@ -1,6 +1,7 @@ process INPUT_CHECK{ tag "Check Sample Inputs and Generate Error Report" label 'process_single' + fair true container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : From d1c9809e48e9e61431ce379e4f64c951737d8bc4 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 11 Jun 2024 13:09:56 -0400 Subject: [PATCH 4/6] Update file path in samplesheet --- tests/data/samplesheets/samplesheet-multiple_queries.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/samplesheets/samplesheet-multiple_queries.csv b/tests/data/samplesheets/samplesheet-multiple_queries.csv index eb661ca..e429a1c 100644 --- a/tests/data/samplesheets/samplesheet-multiple_queries.csv +++ b/tests/data/samplesheets/samplesheet-multiple_queries.csv @@ -1,6 +1,6 @@ sample,mlst_alleles,address sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, -sampleN,/root/working_directory/nml-phac/gasnomenclature/tests/data/reports/sampleN.mlst.json, +sampleN,https://raw.githubusercontent.com/phac-nml/gasnomenclature/update-filter_query/tests/data/reports/sampleN.mlst.json, sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 
sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 From 07b6c3c87f8f5465a98b8a35aa05a7e534e5a8b2 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 11 Jun 2024 14:02:40 -0400 Subject: [PATCH 5/6] Added missing expected test file --- .../distances/expected_pairwise_queries_dists.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/data/distances/expected_pairwise_queries_dists.txt diff --git a/tests/data/distances/expected_pairwise_queries_dists.txt b/tests/data/distances/expected_pairwise_queries_dists.txt new file mode 100644 index 0000000..44aa848 --- /dev/null +++ b/tests/data/distances/expected_pairwise_queries_dists.txt @@ -0,0 +1,11 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sampleN 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 +sampleN sampleQ 0 +sampleN sampleN 0 +sampleN sample1 1 +sampleN sample2 1 +sampleN sample3 2 From 15b7090691f90dbb969be9db01218715673201ca Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 14:24:10 -0400 Subject: [PATCH 6/6] Changed cluster_file output filename for clarity --- modules/local/cluster_file/main.nf | 4 ++-- tests/modules/cluster_file/main.nf.test | 4 ++-- tests/pipelines/main.nf.test | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/local/cluster_file/main.nf b/modules/local/cluster_file/main.nf index dfb8004..0a97545 100644 --- a/modules/local/cluster_file/main.nf +++ b/modules/local/cluster_file/main.nf @@ -6,7 +6,7 @@ process CLUSTER_FILE { val meta output: - path("expected_clusters.txt"), emit: text + path("reference_clusters.txt"), emit: text exec: def outputLines = [] @@ -37,7 +37,7 @@ process CLUSTER_FILE { } // Write the text file, iterating over each sample - task.workDir.resolve("expected_clusters.txt").withWriter { writer -> + 
task.workDir.resolve("reference_clusters.txt").withWriter { writer -> outputLines.each { line -> writer.writeLine(line) } diff --git a/tests/modules/cluster_file/main.nf.test b/tests/modules/cluster_file/main.nf.test index 43fd71c..3f13833 100644 --- a/tests/modules/cluster_file/main.nf.test +++ b/tests/modules/cluster_file/main.nf.test @@ -25,8 +25,8 @@ nextflow_process { assert process.success assert path("$launchDir/cluster_results").exists() - // Check expected_clusters - def actual_clusters = path("$launchDir/cluster_results/cluster/expected_clusters.txt") + // Check reference_clusters file + def actual_clusters = path("$launchDir/cluster_results/cluster/reference_clusters.txt") def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters.txt") assert actual_clusters.text == expected_clusters.text } diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index e3a467a..53ad3d1 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -33,7 +33,7 @@ nextflow_pipeline { assert actual_distances.text == expected_distances.text // Verify cluster file - def actual_cluster = path("$launchDir/results/cluster/expected_clusters.txt") + def actual_cluster = path("$launchDir/results/cluster/reference_clusters.txt") def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt") assert actual_cluster.text == expected_cluster.text @@ -84,7 +84,7 @@ nextflow_pipeline { assert actual_distances.text == expected_distances.text // Verify cluster file - def actual_cluster = path("$launchDir/results/cluster/expected_clusters.txt") + def actual_cluster = path("$launchDir/results/cluster/reference_clusters.txt") def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt") assert actual_cluster.text == expected_cluster.text