Skip to content

Commit

Permalink
Merge pull request #29 from sanger-tol/gx
Browse files Browse the repository at this point in the history
completed fcsgx workflow
  • Loading branch information
yumisims authored Oct 10, 2023
2 parents aea44bc + c30e3a1 commit 447c12f
Show file tree
Hide file tree
Showing 11 changed files with 503 additions and 8 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,35 @@ jobs:
run: |
curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest.tar.gz | tar xzf -
- name: Download the NCBI taxdump database
run: |
mkdir ncbi_taxdump
curl -L https://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar -C ncbi_taxdump -xzf -
- name: Download the FCS-gx database
run: |
mkdir FCS_gx
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.taxa.tsv -O FCS_gx/all.taxa.tsv
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxi -O FCS_gx/all.gxi
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxs -O FCS_gx/all.gxs
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.meta.jsonl -O FCS_gx/all.meta.jsonl
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.blast_div.tsv.gz -O FCS_gx/all.blast_div.tsv.gz
- name: Download the BUSCO lineage database
run: |
mkdir busco_database
curl -L https://tolit.cog.sanger.ac.uk/test-data/resources/busco/blobtoolkit.GCA_922984935.2.2023-08-03.lineages.tar.gz | tar -C busco_database -xzf -
- name: Download the kraken2 database
run: |
mkdir kraken2
curl -L https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/db/kraken2.tar.gz | tar -C kraken2 -xzf -
- name: Download the subset of NT database
run: |
mkdir NT_database
curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/18S_fungal_sequences.tar.gz | tar -C NT_database -xzf -
- name: Run pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
Expand Down
11 changes: 6 additions & 5 deletions assets/github_testing/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ mito_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoel
plastid_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa
kmer_len: 7
## Below this point will need updating as more subworkflows are built
nt_database: /data/blastdb/Supported/NT/current/
nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt
ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-03-14/lineages
nt_database: /home/runner/work/ascc/ascc/NT_database/
nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2
ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/
ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp
busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
seqkit:
sliding: 6000
window: 100000
1 change: 1 addition & 0 deletions assets/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/n
ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-03-14/lineages
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
seqkit:
sliding: 6000
window: 100000
222 changes: 222 additions & 0 deletions bin/parse_fcsgx_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""
Script for parsing the result files of FCS-GX, originally written by Eerik Aunin (ea10)
further minor modifications by Yumi Sims (yy5)
"""

import general_purpose_functions as gpf
from collections import OrderedDict
import os
import sys
import argparse


def get_fcs_gx_outfile_paths(fcs_gx_reports_folder):
    """
    Locates the FCS-GX output files in the FCS-GX output directory.

    Args:
        fcs_gx_reports_folder: path to the directory holding the FCS-GX output files
    Returns:
        (taxonomy_file, report_file): paths to the single *.taxonomy.rpt and
        *.fcs_gx_report.txt files found in the directory
    Exits with status 1 if the directory is missing or either file is not
    present exactly once
    """
    if os.path.isdir(fcs_gx_reports_folder) is False:
        sys.stderr.write("Could not find the directory with FCS-GX output files ({})\n".format(fcs_gx_reports_folder))
        sys.exit(1)

    fcs_gx_outfiles = os.listdir(fcs_gx_reports_folder)
    taxonomy_file = _find_single_outfile(fcs_gx_outfiles, ".taxonomy.rpt", fcs_gx_reports_folder)
    report_file = _find_single_outfile(fcs_gx_outfiles, ".fcs_gx_report.txt", fcs_gx_reports_folder)
    return taxonomy_file, report_file


def _find_single_outfile(outfiles, suffix, folder):
    """
    Returns the path of the single file name in outfiles that ends with suffix.
    Exits with status 1 if zero or multiple matches are found
    """
    matches = [n for n in outfiles if n.endswith(suffix)]
    if len(matches) != 1:
        sys.stderr.write("Error occurred when trying to find FCS-GX *{} file in {}\n".format(suffix, folder))
        sys.exit(1)
    return folder + "/" + matches[0]


def load_taxids_data(taxonomy_file):
    """
    Parses the *.taxonomy.rpt to find taxids that correspond to species names.
    Keeps, for each scaffold, the row with the highest FCS-GX score.
    Returns the result as an ordered dictionary keyed by scaffold name
    """
    lines = gpf.l(taxonomy_file)
    assert len(lines) > 2
    collection_dict = OrderedDict()

    # The first two lines of the file are headers
    for line in lines[2:]:
        fields = line.split("\t")
        assert len(fields) == 34
        # A scaffold name may carry a "~..." suffix; only the base name is kept
        scaff = fields[0]
        if "~" in scaff:
            scaff = scaff.split("~")[0]

        cvg_by_div = fields[8]
        if cvg_by_div != "":
            cvg_by_div = int(cvg_by_div)
        cvg_by_tax = fields[9]
        if cvg_by_tax != "":
            cvg_by_tax = int(cvg_by_tax)
        # Missing scores are treated as 0 so they always lose the comparison below
        score = int(fields[10]) if fields[10] != "" else 0

        seen_before = scaff in collection_dict
        higher_score = seen_before and collection_dict[scaff]["fcs_gx_score"] < score
        if not seen_before or higher_score:
            collection_dict[scaff] = {
                "fcs_gx_top_tax_name": fields[5],
                "fcs_gx_top_taxid": fields[6],
                "fcs_gx_div": fields[7],
                "fcs_gx_coverage_by_div": cvg_by_div,
                "fcs_gx_coverage_by_tax": cvg_by_tax,
                "fcs_gx_score": score,
                # True only when this scaffold had an earlier, lower-scoring entry
                "fcs_gx_multiple_divs_per_scaff": higher_score,
                # Filled in later by load_report_data
                "fcs_gx_action": "NA",
            }
    return collection_dict


def load_report_data(report_file, collection_dict):
    """
    Parses the *.fcs_gx_report.txt to add entries from the 'action' column to the collection of entries per scaffold that is stored in collection_dict.

    Args:
        report_file: path to the FCS-GX *.fcs_gx_report.txt file
        collection_dict: per-scaffold dictionary produced by load_taxids_data
    Returns:
        collection_dict with "fcs_gx_action" filled in for scaffolds found in the report
    """
    report_data = gpf.l(report_file)
    # The first two lines of the report are headers
    if len(report_data) > 2:
        for line in report_data[2:]:
            split_line = line.split("\t")
            assert len(split_line) == 8
            scaff = split_line[0]
            fcs_gx_action = split_line[4]
            # Guard against scaffolds present in the report but absent from the
            # taxonomy-derived dict (previously an unhandled KeyError)
            if scaff in collection_dict:
                collection_dict[scaff]["fcs_gx_action"] = fcs_gx_action
            else:
                sys.stderr.write(
                    "Warning: scaffold {} found in {} but not in the taxonomy file\n".format(scaff, report_file)
                )
    return collection_dict


def get_taxids_list(fcs_gx_taxonomy_file_path):
    """
    Goes through FCS-GX taxonomy output file and returns a list of unique taxIDs found in the file
    """
    if not os.path.isfile(fcs_gx_taxonomy_file_path):
        sys.stderr.write(
            f"The FCS-GX taxonomy file was not found at the expected location ({fcs_gx_taxonomy_file_path})\n"
        )
        sys.exit(1)
    taxids_list = []
    for line in gpf.ll(fcs_gx_taxonomy_file_path):
        # Header lines start with "#"
        if line.startswith("#"):
            continue
        fields = line.split("\t")
        assert len(fields) == 34
        # Column 7 holds the taxid of the top hit; keep first-seen order, no duplicates
        taxid = fields[6]
        if taxid not in taxids_list:
            taxids_list.append(taxid)
    return taxids_list


def get_lineages_by_taxid(taxids_list, rankedlineage_path):
    """
    Takes a list of taxIDs and the path to the NCBI rankedlineage.dmp file as the input. Returns the lineage corresponding to each taxID

    Args:
        taxids_list: list of taxID strings to look up
        rankedlineage_path: path to the NCBI rankedlineage.dmp file
    Returns:
        dict mapping each taxID found in rankedlineage.dmp to a dict of lineage
        fields keyed by the fcs_gx_* column names below
    Exits with status 1 if the rankedlineage.dmp file does not exist
    """
    lineages_dict = dict()
    rankedlineage_col_names = (
        "taxid",
        "fcs_gx_name",
        "fcs_gx_species",
        "fcs_gx_genus",
        "fcs_gx_family",
        "fcs_gx_order",
        "fcs_gx_class",
        "fcs_gx_phylum",
        "fcs_gx_kingdom",
        "fcs_gx_domain",
    )
    if os.path.isfile(rankedlineage_path) is False:
        sys.stderr.write(
            f"The NCBI rankedlineage.dmp file was not found at the expected location ({rankedlineage_path})\n"
        )
        sys.exit(1)
    # rankedlineage.dmp can contain millions of rows: test membership against a
    # set (O(1) per row) instead of scanning the list for every row
    taxids_set = set(taxids_list)
    rankedlineage_data = gpf.ll(rankedlineage_path)
    for line in rankedlineage_data:
        # Fields are "|"-separated and padded with whitespace
        split_line = [n.strip() for n in line.split("|")]
        assert len(split_line) == 11
        taxid = split_line[0]
        if taxid in taxids_set:
            # Columns 1-9 are the lineage ranks, in rankedlineage_col_names order
            lineages_dict[taxid] = {rankedlineage_col_names[i]: split_line[i] for i in range(1, 10)}
    return lineages_dict


def main(fcs_gx_reports_folder, ncbi_rankedlineage_path):
    """
    Combines the per-scaffold FCS-GX results with NCBI lineage data and prints
    the merged table as CSV to STDOUT
    """
    taxonomy_file, report_file = get_fcs_gx_outfile_paths(fcs_gx_reports_folder)
    collection_dict = load_taxids_data(taxonomy_file)
    collection_dict = load_report_data(report_file, collection_dict)
    taxids_list = get_taxids_list(taxonomy_file)
    lineages_dict = get_lineages_by_taxid(taxids_list, ncbi_rankedlineage_path)
    rankedlineage_col_names = (
        "taxid",
        "fcs_gx_name",
        "fcs_gx_species",
        "fcs_gx_genus",
        "fcs_gx_family",
        "fcs_gx_order",
        "fcs_gx_class",
        "fcs_gx_phylum",
        "fcs_gx_kingdom",
        "fcs_gx_domain",
    )
    # Per-scaffold columns emitted before the lineage columns, in output order
    fcs_gx_col_names = (
        "fcs_gx_top_tax_name",
        "fcs_gx_top_taxid",
        "fcs_gx_div",
        "fcs_gx_coverage_by_div",
        "fcs_gx_coverage_by_tax",
        "fcs_gx_score",
        "fcs_gx_multiple_divs_per_scaff",
        "fcs_gx_action",
    )

    out_header = "scaff,fcs_gx_top_tax_name,fcs_gx_top_taxid,fcs_gx_div,fcs_gx_coverage_by_div,fcs_gx_coverage_by_tax,fcs_gx_score,fcs_gx_multiple_divs_per_scaff,fcs_gx_action"
    out_header += "," + ",".join(rankedlineage_col_names)
    print(out_header)
    for scaff, row_dict in collection_dict.items():
        out_fields = [scaff] + [str(row_dict[col]) for col in fcs_gx_col_names]
        lineage = lineages_dict.get(row_dict["fcs_gx_top_taxid"])
        if lineage is not None:
            # Append the nine lineage ranks (rankedlineage_col_names[1:10])
            out_fields.extend(lineage[rankedlineage_col_names[i]] for i in range(1, 10))
        else:
            # No lineage found for this taxid: emit nine empty columns
            out_fields.extend([""] * 9)
        print(",".join(out_fields))


if __name__ == "__main__":
    # Command-line interface: two positional paths plus -v for the version
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
        "fcs_gx_reports_folder",
        type=str,
        help="Path to directory with FCS-GX output files (*.taxonomy.rpt and *.fcs_gx_report.txt)",
    )
    arg_parser.add_argument("ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy")
    arg_parser.add_argument("-v", action="version", version="1.0")
    parsed = arg_parser.parse_args()
    main(parsed.fcs_gx_reports_folder, parsed.ncbi_rankedlineage_path)
5 changes: 5 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
"git_sha": "5a35af8b60d45425c4b9193e567d16b614d93dbe",
"installed_by": ["modules"]
},
"fcs/fcsgx": {
"branch": "master",
"git_sha": "8c4542e5d421c4690cf1fa6ec729e9304763fdaf",
"installed_by": ["modules"]
},
"kraken2/kraken2": {
"branch": "master",
"git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220",
Expand Down
40 changes: 40 additions & 0 deletions modules/local/parse_fcsgx_result.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Runs bin/parse_fcsgx_result.py on a folder of FCS-GX output files to produce
// a CSV combining the per-scaffold FCS-GX results with NCBI lineage data.
process PARSE_FCSGX_RESULT {
    tag "${meta.id}"
    label 'process_low'

    // Plain Python 3.9 is sufficient: the script uses only the standard library
    // (plus the pipeline's bundled general_purpose_functions helper).
    conda "conda-forge::python=3.9"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/python:3.9' :
        'biocontainers/python:3.9' }"

    input:
    // fcs_gx_reports_folder: directory holding *.taxonomy.rpt and *.fcs_gx_report.txt
    tuple val(meta), path(fcs_gx_reports_folder)
    // ncbi_rankedlineage_path: NCBI taxonomy rankedlineage.dmp file
    path ncbi_rankedlineage_path

    output:
    // Parsed CSV written by the script to STDOUT and captured as parsed_fcsgx.csv
    tuple val(meta), path( "*.csv" ), emit: fcsgxresult
    path "versions.yml", emit: versions

    script:
    // NOTE(review): args and prefix are declared per nf-core convention but are
    // not currently used in the command below.
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    parse_fcsgx_result.py ${fcs_gx_reports_folder} ${ncbi_rankedlineage_path} > parsed_fcsgx.csv
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        parse_fcsgx_result: \$(parse_fcsgx_result.py -v)
    END_VERSIONS
    """

    stub:
    // Stub run: create an empty CSV and record versions without running the script
    """
    touch parsed_fcsgx.csv
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
        parse_fcsgx_result: \$(parse_fcsgx_result.py -v)
    END_VERSIONS
    """
}
Loading

0 comments on commit 447c12f

Please sign in to comment.