diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 0bbcd30..b706875 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@v2 + uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 with: workflow: linting.yml workflow_conclusion: completed @@ -21,7 +21,7 @@ jobs: run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment - uses: marocchino/sticky-pull-request-comment@v2 + uses: marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} diff --git a/.gitignore b/.gitignore index 861bd49..8250583 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ testing* *.swp /.nf-test /.nf-test.log -ids.csv +ids.csv \ No newline at end of file diff --git a/.nf-core.yml b/.nf-core.yml index 8814b7c..1f79c1a 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -8,6 +8,10 @@ lint: - .github/workflows/awstest.yml - .github/workflows/awsfulltest.yml - CODE_OF_CONDUCT.md + - lib/Utils.groovy + - lib/WorkflowMain.groovy + - lib/NfcoreTemplate.groovy + - lib/WorkflowFetchdatairidanext.groovy files_unchanged: - assets/sendmail_template.txt - assets/email_template.html diff --git a/CHANGELOG.md b/CHANGELOG.md index 804cc18..af9c591 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [1.1.0] - 2024-04-10 + +### Added + +- The ability to handle individual download errors. These errors will be reported in `prefetch/failures_report.csv`. ## [1.0.1] - 2024-02-22 diff --git a/README.md b/README.md index e9ca09f..2390e32 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,18 @@ Within the `files` section of this JSON file, all of the output paths are relati An additional example of this file can be found at [tests/data/test1_iridanext.output.json](tests/data/test1_iridanext.output.json). +## Failures + +If one or more samples fail to download, the workflow will still attempt to download all other samples in the samplesheet. The samples that fail to download will be reported in a file named `results/prefetch/failures_report.csv`. This CSV file has two columns: `sample` (the name of the sample, matching the input samplesheet) and `error_accession` (the accession that failed to download). + +For example: + +``` +sample,error_accession +ERROR1,SRR999908 +ERROR2,SRR999934 +``` + # Acknowledgements This pipeline uses code and infrastructure developed and maintained by the [nf-core][nf-core] initative, and reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). diff --git a/conf/iridanext.config b/conf/iridanext.config index 216c6cb..b336dfb 100644 --- a/conf/iridanext.config +++ b/conf/iridanext.config @@ -6,6 +6,7 @@ iridanext { validate = true files { idkey = "id" + global = ["**/prefetch/failures_report.csv"] samples = ["**/reads/*.fastq.gz"] } } diff --git a/conf/modules.config b/conf/modules.config index dd1fdfe..7573f95 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -27,6 +27,7 @@ process { } withName: SRATOOLS_PREFETCH { + errorStrategy = 'ignore' maxForks = params.max_jobs_with_network_connections } diff --git a/lib/nfcore_external_java_deps.jar b/lib/nfcore_external_java_deps.jar deleted file mode 100644 index 805c8bb..0000000 Binary files a/lib/nfcore_external_java_deps.jar and /dev/null differ diff --git a/modules/local/prefetchchecker/main.nf b/modules/local/prefetchchecker/main.nf new file mode 100644 index 0000000..bec2527 --- /dev/null +++ b/modules/local/prefetchchecker/main.nf @@ -0,0 +1,21 @@ +process PREFETCH_CHECKER { + tag "prefetch_checker" + label 'process_low' + + input: + val failures // list of failures + + output: + path("failures_report.csv"), emit: failure_report + + exec: + task.workDir.resolve("failures_report.csv").withWriter { writer -> + + writer.writeLine("sample,error_accession") // header + + // Failures + if (failures.size() > 0) { + failures.each { writer.writeLine "${it[0].id},${it[1]}" } + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index da03340..4a99360 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -58,7 +58,9 @@ def main(): } with open("$versions") as f: - versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + versions_by_process = ( + yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + ) # aggregate versions by the module name (derived from fully-qualified process name) versions_by_module = {} diff --git a/nextflow.config b/nextflow.config index 429d5d1..5e5b9b2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -198,7 +198,7 @@ manifest { description = """IRIDA Next pipeline for fetching data from NCBI""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '1.0.1' + version = '1.1.0' doi = '' defaultBranch = 'main' } diff --git a/pyproject.toml b/pyproject.toml index 0d62beb..7d08e1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,13 @@ -# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black. +# Config file for Python. Mostly used to configure linting of bin/*.py with Ruff. # Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. -[tool.black] +[tool.ruff] line-length = 120 -target_version = ["py37", "py38", "py39", "py310"] +target-version = "py38" +select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"] +cache-dir = "~/.cache/ruff" -[tool.isort] -profile = "black" -known_first_party = ["nf_core"] -multi_line_output = 3 +[tool.ruff.isort] +known-first-party = ["nf_core"] + +[tool.ruff.per-file-ignores] +"__init__.py" = ["E402", "F401"] diff --git a/subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/main.nf b/subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/main.nf index 68b718e..c83ea57 100644 --- a/subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/main.nf +++ b/subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/main.nf @@ -1,5 +1,6 @@ include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/custom/sratoolsncbisettings/main' include { SRATOOLS_PREFETCH } from '../../../modules/nf-core/sratools/prefetch/main' +include { PREFETCH_CHECKER } from '../../../modules/local/prefetchchecker/main' include { SRATOOLS_FASTERQDUMP } from '../../../modules/local/sratools/fasterqdump/main' // @@ -27,6 +28,12 @@ workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { SRATOOLS_PREFETCH ( ch_sra_ids, ch_ncbi_settings, ch_dbgap_key ) ch_versions = ch_versions.mix(SRATOOLS_PREFETCH.out.versions.first()) + fetches = ch_sra_ids.join(SRATOOLS_PREFETCH.out.sra, remainder: true) + failed_fetches = fetches.filter { it[2] == null } + .toList() + + PREFETCH_CHECKER (failed_fetches) + // // Convert the SRA format into one or more compressed FASTQ files. // diff --git a/tests/data/errorsheet.csv b/tests/data/errorsheet.csv new file mode 100644 index 0000000..2315852 --- /dev/null +++ b/tests/data/errorsheet.csv @@ -0,0 +1,5 @@ +sample,insdc_accession +SAMPLE1,ERR1109373 +ERROR1,SRR999908 +ERROR2,SRR999934 +SAMPLE2,SRR13191702 diff --git a/tests/data/prefetch_errors_iridanext.output.json b/tests/data/prefetch_errors_iridanext.output.json new file mode 100644 index 0000000..df07228 --- /dev/null +++ b/tests/data/prefetch_errors_iridanext.output.json @@ -0,0 +1,35 @@ +{ + "files": { + "global": [ + { + "path": "prefetch/failures_report.csv" + } + ], + "samples": { + "SAMPLE1": [ + { + "path": "reads/ERR1109373_2.fastq.gz" + }, + { + "path": "reads/ERR1109373_1.fastq.gz" + }, + { + "path": "reads/ERR1109373.fastq.gz" + } + ], + "SAMPLE2": [ + { + "path": "reads/SRR13191702_2.fastq.gz" + }, + { + "path": "reads/SRR13191702_1.fastq.gz" + } + ] + } + }, + "metadata": { + "samples": { + + } + } +} \ No newline at end of file diff --git a/tests/data/test1_iridanext.output.json b/tests/data/test1_iridanext.output.json index cfa4828..247e2c4 100644 --- a/tests/data/test1_iridanext.output.json +++ b/tests/data/test1_iridanext.output.json @@ -1,7 +1,9 @@ { "files": { "global": [ - + { + "path": "prefetch/failures_report.csv" + } ], "samples": { "SAMPLE2": [ diff --git a/tests/pipelines/fetchdatairidanext.nf.test b/tests/pipelines/fetchdatairidanext.nf.test index 698dba9..4f60f64 100644 --- a/tests/pipelines/fetchdatairidanext.nf.test +++ b/tests/pipelines/fetchdatairidanext.nf.test @@ -25,4 +25,33 @@ nextflow_pipeline { assert path("$launchDir/test1_out/reads/SRR13191702_2.fastq.gz").linesGzip.size() == 364 } } + + test("integration test with prefetch failures") { + + when { + params { + input = "$baseDir/tests/data/errorsheet.csv" + outdir = "results" + } + } + + then { + assert workflow.success + + // IRIDA Next output file + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/prefetch_errors_iridanext.output.json").json + + // Output data: + assert path("$launchDir/results/reads/ERR1109373_1.fastq.gz").linesGzip.size() == 512 + assert path("$launchDir/results/reads/ERR1109373_2.fastq.gz").linesGzip.size() == 512 + assert path("$launchDir/results/reads/SRR13191702_1.fastq.gz").linesGzip.size() == 364 + assert path("$launchDir/results/reads/SRR13191702_2.fastq.gz").linesGzip.size() == 364 + + // These files should have failed, and have no output reads: + assert path("$launchDir/results/reads/SRR999908_1.fastq.gz").exists() == false + assert path("$launchDir/results/reads/SRR999908_2.fastq.gz").exists() == false + assert path("$launchDir/results/reads/SRR999934_1.fastq.gz").exists() == false + assert path("$launchDir/results/reads/SRR999934_2.fastq.gz").exists() == false + } + } } diff --git a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test index cf160e1..e4ee30f 100644 --- a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test +++ b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test @@ -32,6 +32,46 @@ nextflow_workflow { { assert workflow.success }, { assert snapshot(workflow.out).match() } ) + + assert path("$launchDir/output").exists() + + def lines = path("$launchDir/output/prefetch/failures_report.csv").readLines() + assert lines.size() == 1 + assert lines.contains("sample,error_accession") + assert lines.contains("test_single_end,DRR000774").equals(false) + assert lines.contains("test_paired_end,SRR11140744").equals(false) + } + } + + test("Download errors: 403 and invalid") { + + when { + workflow { + """ + input[0] = Channel.of( + [[ id:'SAMPLE1', single_end:false ], 'ERR1109373'], + [[ id:'ERROR1', single_end:false ], 'SRR999908'], + [[ id:'ERROR2', single_end:false ], 'INVALID!!'], + [[ id:'SAMPLE2', single_end:false ], 'SRR13191702'] + ) + input[1] = [] + """ + } + params { + outdir = "output" + } + } + + then { + assert workflow.success + assert path("$launchDir/output").exists() + + def lines = path("$launchDir/output/prefetch/failures_report.csv").readLines() + assert lines.size() == 3 + assert lines.contains("sample,error_accession") + assert lines.contains("ERROR1,SRR999908") + assert lines.contains("ERROR2,INVALID!!") } } + }