Skip to content

Commit

Permalink
Merge pull request #29 from sanger-tol/gx
Browse files Browse the repository at this point in the history
completed fcsgx workflow
  • Loading branch information
yumisims authored Oct 10, 2023
2 parents aea44bc + c30e3a1 commit 447c12f
Show file tree
Hide file tree
Showing 11 changed files with 503 additions and 8 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,35 @@ jobs:
run: |
curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest.tar.gz | tar xzf -
- name: Download the NCBI taxdump database
run: |
mkdir ncbi_taxdump
curl -L https://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar -C ncbi_taxdump -xzf -
- name: Download the FCS-gx database
run: |
mkdir FCS_gx
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.taxa.tsv -O FCS_gx/all.taxa.tsv
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxi -O FCS_gx/all.gxi
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxs -O FCS_gx/all.gxs
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.meta.jsonl -O FCS_gx/all.meta.jsonl
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.blast_div.tsv.gz -O FCS_gx/all.blast_div.tsv.gz
- name: Download the BUSCO lineage database
run: |
mkdir busco_database
curl -L https://tolit.cog.sanger.ac.uk/test-data/resources/busco/blobtoolkit.GCA_922984935.2.2023-08-03.lineages.tar.gz | tar -C busco_database -xzf -
- name: Download the kraken2 database
run: |
mkdir kraken2
curl -L https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/db/kraken2.tar.gz | tar -C kraken2 -xzf -
- name: Download the subset of NT database
run: |
mkdir NT_database
curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/18S_fungal_sequences.tar.gz | tar -C NT_database -xzf -
- name: Run pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
Expand Down
11 changes: 6 additions & 5 deletions assets/github_testing/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ mito_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoel
plastid_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa
kmer_len: 7
## Below this point will need updating as more subworkflows are built
nt_database: /data/blastdb/Supported/NT/current/
nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt
ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-03-14/lineages
nt_database: /home/runner/work/ascc/ascc/NT_database/
nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2
ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/
ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp
busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
seqkit:
sliding: 6000
window: 100000
1 change: 1 addition & 0 deletions assets/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/n
ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-03-14/lineages
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
seqkit:
sliding: 6000
window: 100000
222 changes: 222 additions & 0 deletions bin/parse_fcsgx_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""
Script for parsing the result files of FCS-GX, originally written by Eerik Aunin (ea10)
further minor modifications by Yumi Sims (yy5)
"""

import general_purpose_functions as gpf
from collections import OrderedDict
import os
import sys
import argparse


def get_fcs_gx_outfile_paths(fcs_gx_reports_folder):
    """
    Locates the FCS-GX output files in the FCS-GX output directory.

    Args:
        fcs_gx_reports_folder: path to the directory holding the FCS-GX output files
    Returns:
        (taxonomy_file, report_file): paths to the single *.taxonomy.rpt and
        *.fcs_gx_report.txt files found in the directory
    Exits with status 1 if the directory is missing or either file is not
    present exactly once
    """
    if os.path.isdir(fcs_gx_reports_folder) is False:
        sys.stderr.write("Could not find the directory with FCS-GX output files ({})\n".format(fcs_gx_reports_folder))
        sys.exit(1)

    fcs_gx_outfiles = os.listdir(fcs_gx_reports_folder)
    taxonomy_file = _find_single_outfile(fcs_gx_outfiles, ".taxonomy.rpt", fcs_gx_reports_folder)
    report_file = _find_single_outfile(fcs_gx_outfiles, ".fcs_gx_report.txt", fcs_gx_reports_folder)
    return taxonomy_file, report_file


def _find_single_outfile(outfiles, suffix, folder):
    """
    Returns the path of the single file name in outfiles that ends with suffix.
    Exits with status 1 if zero or multiple matches are found
    """
    matches = [n for n in outfiles if n.endswith(suffix)]
    if len(matches) != 1:
        sys.stderr.write("Error occurred when trying to find FCS-GX *{} file in {}\n".format(suffix, folder))
        sys.exit(1)
    return folder + "/" + matches[0]


def load_taxids_data(taxonomy_file):
    """
    Parses the *.taxonomy.rpt to find taxids that correspond to species names.
    Keeps, for each scaffold, the row with the highest FCS-GX score.
    Returns the result as an ordered dictionary keyed by scaffold name
    """
    lines = gpf.l(taxonomy_file)
    assert len(lines) > 2
    collection_dict = OrderedDict()

    # The first two lines of the file are headers
    for line in lines[2:]:
        fields = line.split("\t")
        assert len(fields) == 34
        # A scaffold name may carry a "~..." suffix; only the base name is kept
        scaff = fields[0]
        if "~" in scaff:
            scaff = scaff.split("~")[0]

        cvg_by_div = fields[8]
        if cvg_by_div != "":
            cvg_by_div = int(cvg_by_div)
        cvg_by_tax = fields[9]
        if cvg_by_tax != "":
            cvg_by_tax = int(cvg_by_tax)
        # Missing scores are treated as 0 so they always lose the comparison below
        score = int(fields[10]) if fields[10] != "" else 0

        seen_before = scaff in collection_dict
        higher_score = seen_before and collection_dict[scaff]["fcs_gx_score"] < score
        if not seen_before or higher_score:
            collection_dict[scaff] = {
                "fcs_gx_top_tax_name": fields[5],
                "fcs_gx_top_taxid": fields[6],
                "fcs_gx_div": fields[7],
                "fcs_gx_coverage_by_div": cvg_by_div,
                "fcs_gx_coverage_by_tax": cvg_by_tax,
                "fcs_gx_score": score,
                # True only when this scaffold had an earlier, lower-scoring entry
                "fcs_gx_multiple_divs_per_scaff": higher_score,
                # Filled in later by load_report_data
                "fcs_gx_action": "NA",
            }
    return collection_dict


def load_report_data(report_file, collection_dict):
    """
    Parses the *.fcs_gx_report.txt to add entries from the 'action' column to the collection of entries per scaffold that is stored in collection_dict.

    Args:
        report_file: path to the FCS-GX *.fcs_gx_report.txt file
        collection_dict: per-scaffold dictionary produced by load_taxids_data
    Returns:
        collection_dict with "fcs_gx_action" filled in for scaffolds found in the report
    """
    report_data = gpf.l(report_file)
    # The first two lines of the report are headers
    if len(report_data) > 2:
        for line in report_data[2:]:
            split_line = line.split("\t")
            assert len(split_line) == 8
            scaff = split_line[0]
            fcs_gx_action = split_line[4]
            # Guard against scaffolds present in the report but absent from the
            # taxonomy-derived dict (previously an unhandled KeyError)
            if scaff in collection_dict:
                collection_dict[scaff]["fcs_gx_action"] = fcs_gx_action
            else:
                sys.stderr.write(
                    "Warning: scaffold {} found in {} but not in the taxonomy file\n".format(scaff, report_file)
                )
    return collection_dict


def get_taxids_list(fcs_gx_taxonomy_file_path):
    """
    Goes through FCS-GX taxonomy output file and returns a list of unique taxIDs found in the file
    """
    if not os.path.isfile(fcs_gx_taxonomy_file_path):
        sys.stderr.write(
            f"The FCS-GX taxonomy file was not found at the expected location ({fcs_gx_taxonomy_file_path})\n"
        )
        sys.exit(1)
    taxids_list = []
    for line in gpf.ll(fcs_gx_taxonomy_file_path):
        # Header lines start with "#"
        if line.startswith("#"):
            continue
        fields = line.split("\t")
        assert len(fields) == 34
        # Column 7 holds the taxid of the top hit; keep first-seen order, no duplicates
        taxid = fields[6]
        if taxid not in taxids_list:
            taxids_list.append(taxid)
    return taxids_list


def get_lineages_by_taxid(taxids_list, rankedlineage_path):
    """
    Takes a list of taxIDs and the path to the NCBI rankedlineage.dmp file as the input. Returns the lineage corresponding to each taxID

    Args:
        taxids_list: list of taxID strings to look up
        rankedlineage_path: path to the NCBI rankedlineage.dmp file
    Returns:
        dict mapping each taxID found in rankedlineage.dmp to a dict of lineage
        fields keyed by the fcs_gx_* column names below
    Exits with status 1 if the rankedlineage.dmp file does not exist
    """
    lineages_dict = dict()
    rankedlineage_col_names = (
        "taxid",
        "fcs_gx_name",
        "fcs_gx_species",
        "fcs_gx_genus",
        "fcs_gx_family",
        "fcs_gx_order",
        "fcs_gx_class",
        "fcs_gx_phylum",
        "fcs_gx_kingdom",
        "fcs_gx_domain",
    )
    if os.path.isfile(rankedlineage_path) is False:
        sys.stderr.write(
            f"The NCBI rankedlineage.dmp file was not found at the expected location ({rankedlineage_path})\n"
        )
        sys.exit(1)
    # rankedlineage.dmp can contain millions of rows: test membership against a
    # set (O(1) per row) instead of scanning the list for every row
    taxids_set = set(taxids_list)
    rankedlineage_data = gpf.ll(rankedlineage_path)
    for line in rankedlineage_data:
        # Fields are "|"-separated and padded with whitespace
        split_line = [n.strip() for n in line.split("|")]
        assert len(split_line) == 11
        taxid = split_line[0]
        if taxid in taxids_set:
            # Columns 1-9 are the lineage ranks, in rankedlineage_col_names order
            lineages_dict[taxid] = {rankedlineage_col_names[i]: split_line[i] for i in range(1, 10)}
    return lineages_dict


def main(fcs_gx_reports_folder, ncbi_rankedlineage_path):
    """
    Combines the per-scaffold FCS-GX results with NCBI lineage data and prints
    the merged table as CSV to STDOUT
    """
    taxonomy_file, report_file = get_fcs_gx_outfile_paths(fcs_gx_reports_folder)
    collection_dict = load_taxids_data(taxonomy_file)
    collection_dict = load_report_data(report_file, collection_dict)
    taxids_list = get_taxids_list(taxonomy_file)
    lineages_dict = get_lineages_by_taxid(taxids_list, ncbi_rankedlineage_path)
    rankedlineage_col_names = (
        "taxid",
        "fcs_gx_name",
        "fcs_gx_species",
        "fcs_gx_genus",
        "fcs_gx_family",
        "fcs_gx_order",
        "fcs_gx_class",
        "fcs_gx_phylum",
        "fcs_gx_kingdom",
        "fcs_gx_domain",
    )
    # Per-scaffold columns emitted before the lineage columns, in output order
    fcs_gx_col_names = (
        "fcs_gx_top_tax_name",
        "fcs_gx_top_taxid",
        "fcs_gx_div",
        "fcs_gx_coverage_by_div",
        "fcs_gx_coverage_by_tax",
        "fcs_gx_score",
        "fcs_gx_multiple_divs_per_scaff",
        "fcs_gx_action",
    )

    out_header = "scaff,fcs_gx_top_tax_name,fcs_gx_top_taxid,fcs_gx_div,fcs_gx_coverage_by_div,fcs_gx_coverage_by_tax,fcs_gx_score,fcs_gx_multiple_divs_per_scaff,fcs_gx_action"
    out_header += "," + ",".join(rankedlineage_col_names)
    print(out_header)
    for scaff, row_dict in collection_dict.items():
        out_fields = [scaff] + [str(row_dict[col]) for col in fcs_gx_col_names]
        lineage = lineages_dict.get(row_dict["fcs_gx_top_taxid"])
        if lineage is not None:
            # Append the nine lineage ranks (rankedlineage_col_names[1:10])
            out_fields.extend(lineage[rankedlineage_col_names[i]] for i in range(1, 10))
        else:
            # No lineage found for this taxid: emit nine empty columns
            out_fields.extend([""] * 9)
        print(",".join(out_fields))


if __name__ == "__main__":
    # Command-line interface: two positional paths plus -v for the version
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
        "fcs_gx_reports_folder",
        type=str,
        help="Path to directory with FCS-GX output files (*.taxonomy.rpt and *.fcs_gx_report.txt)",
    )
    arg_parser.add_argument("ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy")
    arg_parser.add_argument("-v", action="version", version="1.0")
    parsed = arg_parser.parse_args()
    main(parsed.fcs_gx_reports_folder, parsed.ncbi_rankedlineage_path)
5 changes: 5 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
"git_sha": "5a35af8b60d45425c4b9193e567d16b614d93dbe",
"installed_by": ["modules"]
},
"fcs/fcsgx": {
"branch": "master",
"git_sha": "8c4542e5d421c4690cf1fa6ec729e9304763fdaf",
"installed_by": ["modules"]
},
"kraken2/kraken2": {
"branch": "master",
"git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220",
Expand Down
40 changes: 40 additions & 0 deletions modules/local/parse_fcsgx_result.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Runs bin/parse_fcsgx_result.py on a folder of FCS-GX output files to produce
// a CSV combining the per-scaffold FCS-GX results with NCBI lineage data.
process PARSE_FCSGX_RESULT {
    tag "${meta.id}"
    label 'process_low'

    // Plain Python 3.9 is sufficient: the script uses only the standard library
    // (plus the pipeline's bundled general_purpose_functions helper).
    conda "conda-forge::python=3.9"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/python:3.9' :
        'biocontainers/python:3.9' }"

    input:
    // fcs_gx_reports_folder: directory holding *.taxonomy.rpt and *.fcs_gx_report.txt
    tuple val(meta), path(fcs_gx_reports_folder)
    // ncbi_rankedlineage_path: NCBI taxonomy rankedlineage.dmp file
    path ncbi_rankedlineage_path

    output:
    // Parsed CSV written by the script to STDOUT and captured as parsed_fcsgx.csv
    tuple val(meta), path( "*.csv" ), emit: fcsgxresult
    path "versions.yml", emit: versions

    script:
    // NOTE(review): args and prefix are declared per nf-core convention but are
    // not currently used in the command below.
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    parse_fcsgx_result.py ${fcs_gx_reports_folder} ${ncbi_rankedlineage_path} > parsed_fcsgx.csv
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        parse_fcsgx_result: \$(parse_fcsgx_result.py -v)
    END_VERSIONS
    """

    stub:
    // Stub run: create an empty CSV and record versions without running the script
    """
    touch parsed_fcsgx.csv
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
        parse_fcsgx_result: \$(parse_fcsgx_result.py -v)
    END_VERSIONS
    """
}
Loading

0 comments on commit 447c12f

Please sign in to comment.