Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add sam2lca build db subworkflow #66

Merged
merged 1 commit into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions bin/create_acc2tax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python

import argparse
import pysam
from pathlib import Path


def parse_args(argv=None):
    """Parse the command-line arguments of the acc2tax helper script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, as before.

    Returns:
        argparse.Namespace with ``genome`` (a ``Path``) and ``taxid``
        (an ``int``).

    Raises:
        SystemExit: If the arguments are invalid or ``-t`` is missing.
    """
    # The text is a description. Passing it positionally would set ``prog``
    # and replace the script name in the usage/error messages.
    parser = argparse.ArgumentParser(description="Create acc2tax file")
    parser.add_argument("genome", type=Path, help="Path to genome file")
    # Required: the taxid is used both as the output file stem and as the
    # value of every row, so a missing value would yield "None" everywhere.
    parser.add_argument(
        "-t",
        type=int,
        dest="taxid",
        required=True,
        help="taxid",
    )

    return parser.parse_args(argv)


def acc2tax(genome, taxid):
    """Write ``<taxid>.accession2taxid`` for every record of *genome*.

    The output is a tab-separated table with the header
    ``accession``, ``accession.version``, ``taxid``. For each record the
    full record name is used as ``accession.version`` and the part of the
    name before its first ``.`` as ``accession``.

    Args:
        genome: Path of the sequence file opened with ``pysam.FastxFile``.
        taxid: Value written in the ``taxid`` column and used as the stem
            of the output file name.
    """
    with pysam.FastxFile(genome) as records:
        # Keyed by the full record name, so a repeated name yields one row.
        accession_by_name = {
            record.name: record.name.split(".")[0] for record in records
        }

    # The table is only created once the whole input has been read.
    with open(f"{taxid}.accession2taxid", "w") as table:
        table.write("accession\taccession.version\ttaxid\n")
        table.writelines(
            f"{accession}\t{versioned}\t{taxid}\n"
            for versioned, accession in accession_by_name.items()
        )


if __name__ == "__main__":
    # Script entry point: read the CLI options and build the table.
    cli = parse_args()
    acc2tax(genome=cli.genome, taxid=cli.taxid)
36 changes: 36 additions & 0 deletions bin/sam2lca_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python


import os
import json
import argparse
from pathlib import Path


def parse_args(argv=None):
    """Parse the command-line arguments of the sam2lca JSON helper script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, as before.

    Returns:
        argparse.Namespace with ``acc2taxid`` and ``md5``, both ``Path``.

    Raises:
        SystemExit: If the arguments are invalid.
    """
    # The text is a description. Passing it positionally would set ``prog``
    # and replace the script name in the usage/error messages.
    parser = argparse.ArgumentParser(description="Create sam2lca json file")
    parser.add_argument(
        "acc2taxid", type=Path, help="Path to accession2taxid gzip compressed file"
    )
    parser.add_argument(
        "md5",
        type=Path,
        help="Path to accession2taxid gzip compressed md5 checksum file",
    )

    return parser.parse_args(argv)


def write_json(acc2taxid, md5, db_name="adnamap"):
    """Write ``<db_name>.sam2lca.json`` into the current directory.

    The JSON object has three keys, each mapping *db_name* to a value:
    ``mapfiles`` (list with the POSIX form of *acc2taxid*), ``mapmd5``
    (list with the POSIX form of *md5*) and ``map_db`` (``<db_name>.db``).

    Args:
        acc2taxid: ``Path`` of the accession2taxid file.
        md5: ``Path`` of the matching md5 checksum file.
        db_name: Name used as the key in every mapping and as the stem of
            the output file.
    """
    mapping_files = {db_name: [acc2taxid.as_posix()]}
    checksum_files = {db_name: [md5.as_posix()]}
    database_files = {db_name: f"{db_name}.db"}
    config = dict(
        mapfiles=mapping_files,
        mapmd5=checksum_files,
        map_db=database_files,
    )

    with open(f"{db_name}.sam2lca.json", "w") as out:
        out.write(json.dumps(config))


if __name__ == "__main__":
    # Script entry point: read the CLI options and emit the JSON file
    # using the default database name.
    cli = parse_args()
    write_json(acc2taxid=cli.acc2taxid, md5=cli.md5)
22 changes: 22 additions & 0 deletions modules/local/create_acc2tax.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Runs create_acc2tax.py on one genome file and emits the resulting
// `<taxid>.accession2taxid` table.
//
// Input : tuple of a meta map (reads `meta.genome_name` for the tag and
//         `meta.taxid` for the script's `-t` option) and the genome file.
// Output: acc2tax - every `*.accession2taxid` file written in the work dir.
process CREATE_ACC2TAX {
    tag "${meta.genome_name}"
    label 'process_single'

    // NOTE(review): the sam2lca environment is presumably reused here because
    // it provides pysam, which create_acc2tax.py imports - confirm.
    conda (params.enable_conda ? "bioconda::sam2lca=1.1.4" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/sam2lca:1.1.4--pyhdfd78af_0' :
        'quay.io/biocontainers/sam2lca:1.1.4--pyhdfd78af_0' }"

    input:
    tuple val(meta), path(fasta)

    output:
    path("*.accession2taxid"), emit: acc2tax

    script:
    // No `task.ext.args` pass-through: the helper script only accepts the
    // genome path and `-t`, and the previous `args` local was never used.
    """
    create_acc2tax.py $fasta -t ${meta.taxid}
    """
}
25 changes: 25 additions & 0 deletions modules/local/sam2lca/prep_db/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Prepares the files sam2lca needs for a custom accession2taxid mapping:
// the gzip-compressed table, its md5 checksum file, and the JSON file
// written by sam2lca_json.py that lists both.
//
// Input : acc2tax - an accession2taxid table.
// Output: acc2tax_md5 (`*.md5`), acc2tax_json (`*.json`), acc2tax_gz (`*.gz`).
process SAM2LCA_PREPDB {
    label 'process_single'

    // Version pin only: "1.1.4--pyhdfd78af_0" is a container tag, not a
    // conda version, so it is aligned with the other sam2lca processes.
    conda (params.enable_conda ? "bioconda::sam2lca=1.1.4" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/sam2lca:1.1.4--pyhdfd78af_0' :
        'quay.io/biocontainers/sam2lca:1.1.4--pyhdfd78af_0' }"

    input:
    path(acc2tax)

    output:
    path("*.md5") , emit: acc2tax_md5
    path("*.json"), emit: acc2tax_json
    path("*.gz")  , emit: acc2tax_gz

    script:
    // `gzip -c` reads the staged input and writes a new file. GNU gzip
    // refuses to compress a symbolic link in place unless -c or -f is given,
    // and Nextflow stages inputs as symlinks by default.
    """
    gzip -c $acc2tax > ${acc2tax}.gz
    md5sum ${acc2tax}.gz > ${acc2tax}.gz.md5
    sam2lca_json.py ${acc2tax}.gz ${acc2tax}.gz.md5
    """
}
7 changes: 7 additions & 0 deletions modules/local/sam2lca/updatedb/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment of the SAM2LCA_UPDATEDB process; its main.nf loads this
# file through `conda "${moduleDir}/environment.yml"`.
channels:
- conda-forge
- bioconda
dependencies:
# Same sam2lca release (1.1.4) as the container images named in main.nf.
- "bioconda::sam2lca=1.1.4"
62 changes: 62 additions & 0 deletions modules/local/sam2lca/updatedb/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Builds a sam2lca database by running `sam2lca update-db` with `sam2lca_db`
// as the database directory, and records the sam2lca version.
process SAM2LCA_UPDATEDB {
tag "${acc2tax_name}"
label 'process_single'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/sam2lca:1.1.4--pyhdfd78af_0':
'biocontainers/sam2lca:1.1.4--pyhdfd78af_0' }"

input:
// acc2tax_name is passed to `-a`, taxo_db_name to `-t`.
val(acc2tax_name)
val(taxo_db_name)
// Each of the next four inputs is turned into a command-line option only
// when it is truthy, so an empty value drops the option.
path(taxo_nodes)// nodes.dmp
path(taxo_names) // names.dmp
path(taxo_merged) // merged.dmp
path(acc2tax_json) // optional
// NOTE(review): the two inputs below are never referenced in the command;
// presumably they are staged so the files listed in acc2tax_json exist in
// the work directory - confirm.
path(acc2tax) // acc2tax.gz
path(acc2tax_md5) // acc2tax.gz.md5

output:
path "sam2lca_db" , emit: sam2lca_db
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
// Extra command-line options supplied through `task.ext.args`, appended last.
def args = task.ext.args ?: ''
// Optional flags: empty string when the corresponding input is not given.
def names = taxo_names ? "--taxo_names ${taxo_names}" : ''
def nodes = taxo_nodes ? "--taxo_nodes ${taxo_nodes}" : ''
def merged = taxo_merged ? "--taxo_merged ${taxo_merged}" : ''
def json = acc2tax_json ? "--acc2tax_json ${acc2tax_json}" : ''
"""
mkdir -p sam2lca_db

sam2lca -d sam2lca_db \\
update-db \\
-t $taxo_db_name \\
$names \\
$nodes \\
$merged \\
-a $acc2tax_name \\
$json \\
$args

cat <<-END_VERSIONS > versions.yml
"${task.process}":
sam2lca: \$(echo \$(sam2lca --version 2>&1) | sed 's/^sam2lca, version //' )
END_VERSIONS
"""

// Stub: creates a placeholder database file instead of running update-db;
// the version is still queried from the sam2lca executable.
stub:
"""
mkdir -p sam2lca_db
touch sam2lca_db/test.pkl

cat <<-END_VERSIONS > versions.yml
"${task.process}":
sam2lca: \$(echo \$(sam2lca --version 2>&1) | sed 's/^sam2lca, version //' )
END_VERSIONS
"""
}
80 changes: 80 additions & 0 deletions modules/local/sam2lca/updatedb/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
name: "sam2lca_updatedb"
description: Build sam2lca database for calling lowest common ancestors from multi-mapped reads in SAM/BAM/CRAM files
keywords:
- LCA
- alignment
- bam
- metagenomics
- Ancestor
- multimapper
- build
- database
tools:
- "sam2lca":
description: "Lowest Common Ancestor on SAM/BAM/CRAM alignment files"
homepage: "https://github.com/maxibor/sam2lca"
documentation: "https://sam2lca.readthedocs.io"
doi: "10.21105/joss.04360"
licence: ["GPL v3"]
identifier: ""

input:
- - acc2tax_name:
type: string
description: Name of accession2taxid type to use
- - taxo_db_name:
type: string
description: Name of taxonomy database type to use
- - taxo_nodes:
type: file
description: "NCBI taxonomy nodes file"
pattern: "*.dmp"
ontologies:
- edam: http://edamontology.org/format_2330
- - taxo_names:
type: file
description: NCBI taxonomy names file
pattern: "*.dmp"
ontologies:
- edam: http://edamontology.org/format_2330
- - taxo_merged:
type: file
description: NCBI taxonomy merged file
pattern: "*.dmp"
ontologies:
- edam: http://edamontology.org/format_2330
- - acc2tax_json:
type: file
description: JSON file listing accession2taxid mapping files. Only required if using a custom database
pattern: "*.json"
ontologies:
- edam: "http://edamontology.org/format_3464"
- - acc2tax:
type: file
description: accession2taxid mapping file compressed with gzip. Only required if using a custom database
pattern: "*.gz"
ontologies:
- edam: http://edamontology.org/format_3989
- - acc2tax_md5:
type: file
description: MD5 checksum of the accession2taxid mapping file. Only required if using a custom database
pattern: "*.md5"
ontologies:
- edam: http://edamontology.org/format_2330

output:
- sam2lca_db:
- sam2lca_db:
type: directory
description: "sam2lca database"
- versions:
- "versions.yml":
type: file
description: File containing software versions
pattern: "versions.yml"

authors:
- "@maxibor"
maintainers:
- "@maxibor"
35 changes: 35 additions & 0 deletions modules/local/sam2lca/updatedb/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// nf-test specification for the SAM2LCA_UPDATEDB process defined in ../main.nf.
nextflow_process {

name "Test Process SAM2LCA_UPDATEDB"
script "../main.nf"
process "SAM2LCA_UPDATEDB"

tag "modules"
tag "modules_nfcore"
tag "sam2lca"
tag "sam2lca/updatedb"

// NOTE(review): the snapshot file next to this spec holds entries named
// "test-sam2lca-updatedb - test taxonomy" and "sam2lca-updatedb - stub";
// neither matches this test's name and no snapshot is asserted here.
test("test-sam2lca-updatedb - test dataset") {
when {
process {
// input[0] and input[1] are the two name values; input[2] to input[7]
// are the six path inputs, all left empty.
"""
input[0] = 'test'
input[1] = 'test'
input[2] = []
input[3] = []
input[4] = []
input[5] = []
input[6] = []
input[7] = []
"""
}
}

then {
// Only checks that the process finished successfully.
assertAll(
{ assert process.success }
)
}
}

}
92 changes: 92 additions & 0 deletions modules/local/sam2lca/updatedb/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{
"test-sam2lca-updatedb - test taxonomy": {
"content": [
{
"0": [
[
"merged.dmp:md5,d41d8cd98f00b204e9800998ecf8427e",
"merged.dmp.gz:md5,d41d8cd98f00b204e9800998ecf8427e",
"merged.dmp.gz.md5:md5,f6e01130c21a58a3371eddec53a18f6f",
"names.dmp:md5,e7994ec89470481e031b3ecef616e778",
"names.dmp.gz:md5,e7994ec89470481e031b3ecef616e778",
"names.dmp.gz.md5:md5,ce7546bbac7dcbe5c0054975538d7fb7",
"nodes.dmp:md5,9e934f98f3c2ace17fa3d77eb235f96f",
"nodes.dmp.gz:md5,9e934f98f3c2ace17fa3d77eb235f96f",
"nodes.dmp.gz.md5:md5,a4597c31842067abe62dda359f8bd854",
[
"000005.log:md5,9167e183e1dc6070dbfc81c4674d9654",
"CURRENT:md5,6752a1d65b201c13b62ea44016eb221f",
"IDENTITY:md5,730568fb30c4bd8dc6db1b95d288b9dc",
"LOCK:md5,d41d8cd98f00b204e9800998ecf8427e",
"LOG:md5,a6e02bcd897ccd9a28a40cf3dd350292",
"MANIFEST-000004:md5,17211b4c15bffd11c80dfbd5b7db77ef",
"OPTIONS-000007:md5,22187b853e27d095e1eac121574b1d95"
],
"test.pkl:md5,d9f5e1d08d7b678281ac088cdca355c9"
]
],
"1": [
"versions.yml:md5,175fb2cc18a30f7ab660efe1a42b7161"
],
"sam2lca_db": [
[
"merged.dmp:md5,d41d8cd98f00b204e9800998ecf8427e",
"merged.dmp.gz:md5,d41d8cd98f00b204e9800998ecf8427e",
"merged.dmp.gz.md5:md5,f6e01130c21a58a3371eddec53a18f6f",
"names.dmp:md5,e7994ec89470481e031b3ecef616e778",
"names.dmp.gz:md5,e7994ec89470481e031b3ecef616e778",
"names.dmp.gz.md5:md5,ce7546bbac7dcbe5c0054975538d7fb7",
"nodes.dmp:md5,9e934f98f3c2ace17fa3d77eb235f96f",
"nodes.dmp.gz:md5,9e934f98f3c2ace17fa3d77eb235f96f",
"nodes.dmp.gz.md5:md5,a4597c31842067abe62dda359f8bd854",
[
"000005.log:md5,9167e183e1dc6070dbfc81c4674d9654",
"CURRENT:md5,6752a1d65b201c13b62ea44016eb221f",
"IDENTITY:md5,730568fb30c4bd8dc6db1b95d288b9dc",
"LOCK:md5,d41d8cd98f00b204e9800998ecf8427e",
"LOG:md5,a6e02bcd897ccd9a28a40cf3dd350292",
"MANIFEST-000004:md5,17211b4c15bffd11c80dfbd5b7db77ef",
"OPTIONS-000007:md5,22187b853e27d095e1eac121574b1d95"
],
"test.pkl:md5,d9f5e1d08d7b678281ac088cdca355c9"
]
],
"versions": [
"versions.yml:md5,175fb2cc18a30f7ab660efe1a42b7161"
]
}
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "24.10.1"
},
"timestamp": "2024-11-22T14:42:14.067314457"
},
"sam2lca-updatedb - stub": {
"content": [
{
"0": [
[
"test.pkl:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"1": [
"versions.yml:md5,175fb2cc18a30f7ab660efe1a42b7161"
],
"sam2lca_db": [
[
"test.pkl:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"versions": [
"versions.yml:md5,175fb2cc18a30f7ab660efe1a42b7161"
]
}
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "24.10.1"
},
"timestamp": "2024-11-22T14:43:21.32148453"
}
}
Loading
Loading