From f8dc199868698c88b6794a78fd31a52bce01f4d6 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Nov 2024 15:45:34 +0100 Subject: [PATCH] add sam2lca build db subworkflow --- bin/create_acc2tax.py | 29 ++++++ bin/sam2lca_json.py | 36 ++++++++ modules/local/create_acc2tax.nf | 22 +++++ modules/local/sam2lca/prep_db/main.nf | 25 +++++ .../local/sam2lca/updatedb/environment.yml | 7 ++ modules/local/sam2lca/updatedb/main.nf | 62 +++++++++++++ modules/local/sam2lca/updatedb/meta.yml | 80 ++++++++++++++++ .../local/sam2lca/updatedb/tests/main.nf.test | 35 +++++++ .../sam2lca/updatedb/tests/main.nf.test.snap | 92 +++++++++++++++++++ subworkflows/local/sam2lca_db.nf | 35 +++++++ workflows/coproid.nf | 31 +++++-- 11 files changed, 445 insertions(+), 9 deletions(-) create mode 100755 bin/create_acc2tax.py create mode 100755 bin/sam2lca_json.py create mode 100755 modules/local/create_acc2tax.nf create mode 100644 modules/local/sam2lca/prep_db/main.nf create mode 100644 modules/local/sam2lca/updatedb/environment.yml create mode 100644 modules/local/sam2lca/updatedb/main.nf create mode 100644 modules/local/sam2lca/updatedb/meta.yml create mode 100644 modules/local/sam2lca/updatedb/tests/main.nf.test create mode 100644 modules/local/sam2lca/updatedb/tests/main.nf.test.snap create mode 100755 subworkflows/local/sam2lca_db.nf diff --git a/bin/create_acc2tax.py b/bin/create_acc2tax.py new file mode 100755 index 0000000..6f0daaa --- /dev/null +++ b/bin/create_acc2tax.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +import argparse +import pysam +from pathlib import Path + + +def parse_args(): + parser = argparse.ArgumentParser("Create acc2tax file") + parser.add_argument("genome", type=Path, help="Path to genome file") + parser.add_argument("-t", type=int, dest="taxid", help="taxid") + + return parser.parse_args() + + +def acc2tax(genome, taxid): + entry_dict = dict() + with pysam.FastxFile(genome) as fh: + for entry in fh: + entry_dict[entry.name] = [entry.name.split(".")[0], taxid] + with open(f"{taxid}.accession2taxid", "w") as fh: + fh.write("accession\taccession.version\ttaxid\n") + for k, v in entry_dict.items(): + fh.write(f"{v[0]}\t{k}\t{v[1]}\n") + + +if __name__ == "__main__": + args = parse_args() + acc2tax(args.genome, args.taxid) diff --git a/bin/sam2lca_json.py b/bin/sam2lca_json.py new file mode 100755 index 0000000..32d5432 --- /dev/null +++ b/bin/sam2lca_json.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + + +import os +import json +import argparse +from pathlib import Path + + +def parse_args(): + parser = argparse.ArgumentParser("Create sam2lca json file") + parser.add_argument( + "acc2taxid", type=Path, help="Path to accession2taxid gzip compressed file" + ) + parser.add_argument( + "md5", + type=Path, + help="Path to accession2taxid gzip compressed md5 checksum file", + ) + + return parser.parse_args() + + +def write_json(acc2taxid, md5, db_name="adnamap"): + sam2lca_dict = { + "mapfiles": {db_name: [acc2taxid.as_posix()]}, + "mapmd5": {db_name: [md5.as_posix()]}, + "map_db": {db_name: f"{db_name}.db"}, + } + with open(f"{db_name}.sam2lca.json", "w") as fh: + json.dump(sam2lca_dict, fh) + + +if __name__ == "__main__": + args = parse_args() + write_json(args.acc2taxid, args.md5) diff --git a/modules/local/create_acc2tax.nf b/modules/local/create_acc2tax.nf new file mode 100755 index 0000000..a66b72c --- /dev/null +++ b/modules/local/create_acc2tax.nf @@ -0,0 +1,22 @@ +process CREATE_ACC2TAX { + tag "${meta.genome_name}" + label 'process_single' + + conda (params.enable_conda ? "bioconda::sam2lca=1.1.4" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sam2lca:1.1.4--pyhdfd78af_0' : + 'quay.io/biocontainers/sam2lca:1.1.4--pyhdfd78af_0' }" + + input: + tuple val(meta), path(fasta) + + output: + path("*.accession2taxid"), emit: acc2tax + + script: + def args = task.ext.args ?: "" + + """ + create_acc2tax.py $fasta -t ${meta.taxid} + """ +} diff --git a/modules/local/sam2lca/prep_db/main.nf b/modules/local/sam2lca/prep_db/main.nf new file mode 100644 index 0000000..1898233 --- /dev/null +++ b/modules/local/sam2lca/prep_db/main.nf @@ -0,0 +1,25 @@ +process SAM2LCA_PREPDB { + label 'process_single' + + conda (params.enable_conda ? "bioconda::sam2lca=1.1.4--pyhdfd78af_0" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sam2lca:1.1.4--pyhdfd78af_0' : + 'quay.io/biocontainers/sam2lca:1.1.4--pyhdfd78af_0' }" + + input: + path(acc2tax) + + output: + path("*.md5"), emit: acc2tax_md5 + path("*.json"), emit: acc2tax_json + path("*.gz"), emit: acc2tax_gz + + script: + def args = task.ext.args ?: "" + + """ + gzip $acc2tax + md5sum ${acc2tax}.gz > ${acc2tax}.gz.md5 + sam2lca_json.py ${acc2tax}.gz ${acc2tax}.gz.md5 + """ +} diff --git a/modules/local/sam2lca/updatedb/environment.yml b/modules/local/sam2lca/updatedb/environment.yml new file mode 100644 index 0000000..4696aa7 --- /dev/null +++ b/modules/local/sam2lca/updatedb/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::sam2lca=1.1.4" diff --git a/modules/local/sam2lca/updatedb/main.nf b/modules/local/sam2lca/updatedb/main.nf new file mode 100644 index 0000000..35f77c5 --- /dev/null +++ b/modules/local/sam2lca/updatedb/main.nf @@ -0,0 +1,62 @@ +process SAM2LCA_UPDATEDB { + tag "${acc2tax_name}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sam2lca:1.1.4--pyhdfd78af_0': + 'biocontainers/sam2lca:1.1.4--pyhdfd78af_0' }" + + input: + val(acc2tax_name) + val(taxo_db_name) + path(taxo_nodes)// nodes.dmp + path(taxo_names) // names.dmp + path(taxo_merged) // merged.dmp + path(acc2tax_json) // optional + path(acc2tax) // acc2tax.gz + path(acc2tax_md5) // acc2tax.gz.md5 + + output: + path "sam2lca_db" , emit: sam2lca_db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def names = taxo_names ? "--taxo_names ${taxo_names}" : '' + def nodes = taxo_nodes ? "--taxo_nodes ${taxo_nodes}" : '' + def merged = taxo_merged ? "--taxo_merged ${taxo_merged}" : '' + def json = acc2tax_json ? "--acc2tax_json ${acc2tax_json}" : '' + """ + mkdir -p sam2lca_db + + sam2lca -d sam2lca_db \\ + update-db \\ + -t $taxo_db_name \\ + $names \\ + $nodes \\ + $merged \\ + -a $acc2tax_name \\ + $json \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sam2lca: \$(echo \$(sam2lca --version 2>&1) | sed 's/^sam2lca, version //' ) + END_VERSIONS + """ + + stub: + """ + mkdir -p sam2lca_db + touch sam2lca_db/test.pkl + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sam2lca: \$(echo \$(sam2lca --version 2>&1) | sed 's/^sam2lca, version //' ) + END_VERSIONS + """ +} diff --git a/modules/local/sam2lca/updatedb/meta.yml b/modules/local/sam2lca/updatedb/meta.yml new file mode 100644 index 0000000..9d474ae --- /dev/null +++ b/modules/local/sam2lca/updatedb/meta.yml @@ -0,0 +1,80 @@ +name: "sam2lca_updatedb" +description: Build sam2lca database for calling lowest common ancestors from multi-mapped reads in SAM/BAM/CRAM + files +keywords: + - LCA + - alignment + - bam + - metagenomics + - Ancestor + - multimapper + - build + - database +tools: + - "sam2lca": + description: "Lowest Common Ancestor on SAM/BAM/CRAM alignment files" + homepage: "https://github.com/maxibor/sam2lca" + documentation: "https://sam2lca.readthedocs.io" + doi: "10.21105/joss.04360" + licence: ["GPL v3"] + identifier: "" + +input: + - - acc2tax_name: + type: string + description: Name of accession2taxid type to use + - - taxo_db_name: + type: string + description: Name of taxonomy dabase type to use + - - taxo_nodes: + type: file + description: "NCBI taxonomy nodes file" + pattern: "*.dmp" + ontologies: + - edam: http://edamontology.org/format_2330 + - - taxo_names: + type: file + description: NCBI taxonomy names file + pattern: "*.dmp" + ontologies: + - edam: http://edamontology.org/format_2330 + - - taxo_merged: + type: file + description: NCBI taxonomy merged file + pattern: "*.dmp" + ontologies: + - edam: http://edamontology.org/format_2330 + - - acc2tax_json: + type: file + description: JSON file listing accession2taxid mapping files. Only required if using a custom database + pattern: "*.json" + ontologies: + - edam: "http://edamontology.org/format_3464" + - - acc2tax: + type: string + description: accession2taxid mapping file compressed with gzip. Only required if using a custom database + pattern: "*.gz" + ontologies: + - edam: http://edamontology.org/format_3989 + - - acc2tax_md5: + type: file + description: MD5 checksum of the accession2taxid mapping file. Only required if using a custom database + pattern: "*.md5" + ontologies: + - edam: http://edamontology.org/format_2330 + +output: + - sam2lca_db: + - sam2lca_db: + type: directory + description: "sam2lca database" + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@maxibor" +maintainers: + - "@maxibor" diff --git a/modules/local/sam2lca/updatedb/tests/main.nf.test b/modules/local/sam2lca/updatedb/tests/main.nf.test new file mode 100644 index 0000000..1af2eec --- /dev/null +++ b/modules/local/sam2lca/updatedb/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process SAM2LCA_UPDATEDB" + script "../main.nf" + process "SAM2LCA_UPDATEDB" + + tag "modules" + tag "modules_nfcore" + tag "sam2lca" + tag "sam2lca/updatedb" + + test("test-sam2lca-updatedb - test dataset") { + when { + process { + """ + input[0] = 'test' + input[1] = 'test' + input[2] = [] + input[3] = [] + input[4] = [] + input[5] = [] + input[6] = [] + input[7] = [] + """ + } + } + + then { + assertAll( + { assert process.success } + ) + } + } + +} diff --git a/modules/local/sam2lca/updatedb/tests/main.nf.test.snap b/modules/local/sam2lca/updatedb/tests/main.nf.test.snap new file mode 100644 index 0000000..072e340 --- /dev/null +++ b/modules/local/sam2lca/updatedb/tests/main.nf.test.snap @@ -0,0 +1,92 @@ +{ + "test-sam2lca-updatedb - test taxonomy": { + "content": [ + { + "0": [ + [ + "merged.dmp:md5,d41d8cd98f00b204e9800998ecf8427e", + "merged.dmp.gz:md5,d41d8cd98f00b204e9800998ecf8427e", + "merged.dmp.gz.md5:md5,f6e01130c21a58a3371eddec53a18f6f", + "names.dmp:md5,e7994ec89470481e031b3ecef616e778", + "names.dmp.gz:md5,e7994ec89470481e031b3ecef616e778", + "names.dmp.gz.md5:md5,ce7546bbac7dcbe5c0054975538d7fb7", + "nodes.dmp:md5,9e934f98f3c2ace17fa3d77eb235f96f", + "nodes.dmp.gz:md5,9e934f98f3c2ace17fa3d77eb235f96f", + "nodes.dmp.gz.md5:md5,a4597c31842067abe62dda359f8bd854", + [ + "000005.log:md5,9167e183e1dc6070dbfc81c4674d9654", + "CURRENT:md5,6752a1d65b201c13b62ea44016eb221f", + "IDENTITY:md5,730568fb30c4bd8dc6db1b95d288b9dc", + "LOCK:md5,d41d8cd98f00b204e9800998ecf8427e", + "LOG:md5,a6e02bcd897ccd9a28a40cf3dd350292", + "MANIFEST-000004:md5,17211b4c15bffd11c80dfbd5b7db77ef", + "OPTIONS-000007:md5,22187b853e27d095e1eac121574b1d95" + ], + "test.pkl:md5,d9f5e1d08d7b678281ac088cdca355c9" + ] + ], + "1": [ + "versions.yml:md5,175fb2cc18a30f7ab660efe1a42b7161" + ], + "sam2lca_db": [ + [ + "merged.dmp:md5,d41d8cd98f00b204e9800998ecf8427e", + "merged.dmp.gz:md5,d41d8cd98f00b204e9800998ecf8427e", + "merged.dmp.gz.md5:md5,f6e01130c21a58a3371eddec53a18f6f", + "names.dmp:md5,e7994ec89470481e031b3ecef616e778", + "names.dmp.gz:md5,e7994ec89470481e031b3ecef616e778", + "names.dmp.gz.md5:md5,ce7546bbac7dcbe5c0054975538d7fb7", + "nodes.dmp:md5,9e934f98f3c2ace17fa3d77eb235f96f", + "nodes.dmp.gz:md5,9e934f98f3c2ace17fa3d77eb235f96f", + "nodes.dmp.gz.md5:md5,a4597c31842067abe62dda359f8bd854", + [ + "000005.log:md5,9167e183e1dc6070dbfc81c4674d9654", + "CURRENT:md5,6752a1d65b201c13b62ea44016eb221f", + "IDENTITY:md5,730568fb30c4bd8dc6db1b95d288b9dc", + "LOCK:md5,d41d8cd98f00b204e9800998ecf8427e", + "LOG:md5,a6e02bcd897ccd9a28a40cf3dd350292", + "MANIFEST-000004:md5,17211b4c15bffd11c80dfbd5b7db77ef", + "OPTIONS-000007:md5,22187b853e27d095e1eac121574b1d95" + ], + "test.pkl:md5,d9f5e1d08d7b678281ac088cdca355c9" + ] + ], + "versions": [ + "versions.yml:md5,175fb2cc18a30f7ab660efe1a42b7161" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.10.1" + }, + "timestamp": "2024-11-22T14:42:14.067314457" + }, + "sam2lca-updatedb - stub": { + "content": [ + { + "0": [ + [ + "test.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,175fb2cc18a30f7ab660efe1a42b7161" + ], + "sam2lca_db": [ + [ + "test.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,175fb2cc18a30f7ab660efe1a42b7161" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.10.1" + }, + "timestamp": "2024-11-22T14:43:21.32148453" + } +} \ No newline at end of file diff --git a/subworkflows/local/sam2lca_db.nf b/subworkflows/local/sam2lca_db.nf new file mode 100755 index 0000000..040cf86 --- /dev/null +++ b/subworkflows/local/sam2lca_db.nf @@ -0,0 +1,35 @@ +include { CREATE_ACC2TAX } from '../../modules/local/create_acc2tax' +include { SAM2LCA_PREPDB } from '../../modules/local/sam2lca/prep_db/main' +include { SAM2LCA_UPDATEDB } from '../../modules/local/sam2lca/updatedb/main' + +workflow SAM2LCA_DB { + take: + genomes // meta, fasta + taxo_nodes // nodes.dmp + taxo_names // names.dmp + taxo_merged // merged.dmp + + main: + CREATE_ACC2TAX(genomes) + + acc2tax = CREATE_ACC2TAX.out.acc2tax.collectFile( + name: 'adnamap.accession2taxid', + keepHeader: true + ) + + SAM2LCA_PREPDB(acc2tax) + + SAM2LCA_UPDATEDB( + "adnamap", + "ncbi_local", + taxo_nodes, + taxo_names, + taxo_merged, + SAM2LCA_PREPDB.out.acc2tax_json, + SAM2LCA_PREPDB.out.acc2tax_gz, + SAM2LCA_PREPDB.out.acc2tax_md5 + ) + + emit: + sam2lca_db = SAM2LCA_UPDATEDB.out.sam2lca_db +} diff --git a/workflows/coproid.nf b/workflows/coproid.nf index c127283..0f2056f 100644 --- a/workflows/coproid.nf +++ b/workflows/coproid.nf @@ -16,9 +16,9 @@ include { PYDAMAGE_ANALYZE } from '../modules/nf-core/pydamage/analyze/mai include { BBMAP_BBDUK } from '../modules/nf-core/bbmap/bbduk/main' include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/kraken2/kraken2/main' include { KRAKEN_PARSE } from '../modules/local/kraken_parse' -include { KRAKEN_MERGE } from '../modules/local/kraken_merge' +include { KRAKEN_MERGE } from '../modules/local/kraken_merge' include { SOURCEPREDICT } from '../modules/nf-core/sourcepredict/main' -include { QUARTONOTEBOOK } from '../modules/nf-core/quartonotebook/main' +include { QUARTONOTEBOOK } from '../modules/nf-core/quartonotebook/main' include { paramsSummaryMap } from 'plugin/nf-schema' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -28,13 +28,14 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_copr // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules // include { PREPARE_GENOMES } from '../subworkflows/local/prepare_genome_indices' +include { SAM2LCA_DB } from '../subworkflows/local/sam2lca_db' include { ALIGN_INDEX } from '../subworkflows/local/align_index' include { MERGE_SORT_INDEX_SAMTOOLS } from '../subworkflows/local/merge_sort_index_samtools' include { KRAKEN2_CLASSIFICATION } from '../subworkflows/local/kraken2_classification' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - CREATE CHANNELS + CREATE CHANNELS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ @@ -66,7 +67,7 @@ workflow COPROID { // // SUBWORKFLOW: Prepare genomes from genome sheet // - + PREPARE_GENOMES ( ch_genomesheet ) @@ -119,10 +120,10 @@ workflow COPROID { ] }.dump(tag: 'reads_genomes') .set { ch_reads_genomes_index } - + ALIGN_INDEX ( ch_reads_genomes_index - ) + ) ch_versions = ch_versions.mix(ALIGN_INDEX.out.versions.first()) // ch_multiqc_files = ch_multiqc_files.mix(ALIGN_INDEX.out.log.collect{it[1]}) @@ -160,6 +161,15 @@ workflow COPROID { ) ch_versions = ch_versions.mix(MERGE_SORT_INDEX_SAMTOOLS.out.versions.first()) + SAM2LCA_DB( + PREPARE_GENOMES.out.genomes.map { + meta, fasta, index -> [meta, fasta] + }, + [], + [], + [] + ) + // // MODULE: Run sam2lca // @@ -167,7 +177,10 @@ workflow COPROID { MERGE_SORT_INDEX_SAMTOOLS.out.bam.join( MERGE_SORT_INDEX_SAMTOOLS.out.bai ), - ch_sam2lca_db + SAM2LCA_DB.out.sam2lca_db, + [], + [], + [] ) ch_sam2lca = SAM2LCA_ANALYZE.out.csv ch_versions = ch_versions.mix(SAM2LCA_ANALYZE.out.versions.first()) @@ -180,9 +193,9 @@ workflow COPROID { ch_kraken2_db ) ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_CLASSIFICATION.out.kraken_report.collect{it[1]}) - + KRAKEN2_CLASSIFICATION.out.kraken_merged_report.dump(tag: 'kraken_parse') - .map { + .map { kraken_merged_report -> [ [