
Dp24 vecscreen2 updates #41

Merged
merged 23 commits
Nov 20, 2023
Changes from 21 commits
Commits
23 commits
ca6e3d6
added VecScreen related files
eeaunin Nov 16, 2023
568e963
Edited the main workflow and test yaml
eeaunin Nov 16, 2023
87cffd9
Edited the main workflow
eeaunin Nov 16, 2023
f045b69
Edited the test yaml file
eeaunin Nov 16, 2023
dcfb977
Edited the test yaml file
eeaunin Nov 16, 2023
320e071
Edited the test yaml file
eeaunin Nov 16, 2023
ddc2f30
Deleted trailing Ns scripts and reformatted the VecScreen Python scri…
eeaunin Nov 16, 2023
f39e4c5
Trying a change in github_testing/test.yaml
eeaunin Nov 16, 2023
d5d7838
Trying a change in github_testing/test.yaml
eeaunin Nov 16, 2023
234803a
add vecscreen db
yumisims Nov 17, 2023
1f62898
putting in blastdb tuple generation
yumisims Nov 17, 2023
a1cd1ff
refactor chunk fasta
yumisims Nov 17, 2023
6cc19bc
black
yumisims Nov 17, 2023
dea8d11
black
yumisims Nov 17, 2023
8bcae07
the default python is 3.5 so change the beloved f striiiiing
yumisims Nov 17, 2023
77a98a7
Formatting and minor changes to channels
DLBPointon Nov 17, 2023
7317fd7
removed remnant file
DLBPointon Nov 17, 2023
005c13e
Missed a couple of comments
DLBPointon Nov 17, 2023
4669b12
add wildcard to vecscreen output
yumisims Nov 17, 2023
946cf0e
remove standard error output to pass github test
yumisims Nov 17, 2023
05612dd
Reverting changes to pacbio_barcode_check and adding barcodes found i…
DLBPointon Nov 20, 2023
79a804a
Correct the other test.yaml for github testing
DLBPointon Nov 20, 2023
afbbde7
Fixed chunk assembly for vecscreen, added argparse for threshold vari…
DLBPointon Nov 20, 2023
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -82,7 +82,7 @@ jobs:
- name: Download the vecscreen test data
run: |
mkdir vecscreen
wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna -O vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/v4/16SMicrobial_v4.tar.gz | tar -C vecscreen -xzf -

- name: Run pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
2 changes: 1 addition & 1 deletion assets/github_testing/test.yaml
@@ -20,7 +20,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen
seqkit:
sliding: 6000
window: 100000
6 changes: 3 additions & 3 deletions assets/test.yaml
@@ -1,9 +1,9 @@
assembly_path: /lustre/scratch124/tol/projects/tol/data/insects/Polyommatus_atlantica/assembly/draft/treeval/ilPolAtla1_merged/raw/ref.fa
assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2_fh2.fasta
assembly_title: asccTinyTest
reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
reads_type: "hifi"
pacbio_barcodes: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc1008,bc1009"
pacbio_multiplexing_barcode_names: "bc2008,bc2009"
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
@@ -17,7 +17,7 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database/adaptors_for_screening_euks.fa
vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database
diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd
diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd
seqkit:
95 changes: 95 additions & 0 deletions bin/VSlistTo1HitPerLine.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""
This script is based on an AWK script from NCBI: https://ftp.ncbi.nlm.nih.gov/pub/kitts/VSlistTo1HitPerLine.awk, originally by Paul Kitts.
Converted to Python by Eerik Aunin.

This script converts the VecScreen text list output to one line giving the coordinates for each vector segment in the format:
VecScreen_Category ID_string start_position end_position
The default is to report Strong, Moderate, and Weak matches and also segments of Suspect Origin. Reporting of any category can be suppressed by including
--skip_reporting_suspect_hits, --skip_reporting_weak_hits, --skip_reporting_moderate_hits or --skip_reporting_strong_hits on the command line.
"No hits" will be reported for any Query sequence that had no matches in any of the selected categories, unless --skip_reporting_no_hits is included on the command line.
VecScreen errors will be reported unless --skip_reporting_errors is included on the command line.
Usage:
VSlistTo1HitPerLine.py [options] vecscreen_output_file
"""

import re
import argparse


def main(args):
    hits_to_report = ""
    ID = ""
    hits = 0
    error_found = 0

    with open(args.vecscreen_output_file, "r") as file:
        for line in file:
            line = line.strip()
            Fld = line.split(" ")

            if not re.match(r"^[0-9 \t]+$", line):
                hits_to_report = ""

            if hits_to_report:
                print(f"VecScreen_{hits_to_report.ljust(8)}\t{ID}\t{line}")
                hits += 1
                continue

            if line.startswith(">Vector "):
                if ID != "":
                    if error_found:
                        print(f"VecScreen_ERROR \t{ID}")
                    elif hits == 0 and args.skip_reporting_no_hits is False:
                        print(f"VecScreen_No_Hits \t{ID}")
                ID = Fld[1]
                hits = 0
                error_found = 0
                continue

            if args.skip_reporting_strong_hits is False and line.startswith("Strong"):
                hits_to_report = "Strong"
                continue

            if args.skip_reporting_moderate_hits is False and line.startswith("Moderate"):
                hits_to_report = "Moderate"
                continue

            if args.skip_reporting_weak_hits is False and line.startswith("Weak"):
                hits_to_report = "Weak"
                continue

            if args.skip_reporting_suspect_hits is False and line.startswith("Suspect"):
                hits_to_report = "Suspect"
                continue

            if args.skip_reporting_errors is False:
                if line.startswith("ERROR") or line.startswith("WARNING"):
                    error_found += 1

    if ID != "":
        if error_found > 0:
            print(f"VecScreen_ERROR \t{ID}")
        elif hits == 0 and args.skip_reporting_no_hits is False:
            print(f"VecScreen_No_Hits \t{ID}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Reformatting VecScreen's output")
    parser.add_argument(
        "vecscreen_output_file",
        type=str,
        help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)",
        default=None,
    )
    parser.add_argument("--skip_reporting_strong_hits", action="store_true", help="Skip reporting strong hits")
    parser.add_argument("--skip_reporting_moderate_hits", action="store_true", help="Skip reporting moderate hits")
    parser.add_argument("--skip_reporting_weak_hits", action="store_true", help="Skip reporting weak hits")
    parser.add_argument(
        "--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin"
    )
    parser.add_argument("--skip_reporting_no_hits", action="store_true", help="Skip reporting no-hits")
    parser.add_argument("--skip_reporting_errors", action="store_true", help="Skip reporting errors")
    parser.add_argument("-v", "--version", action="version", version="1.0")
    args = parser.parse_args()
    main(args)
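To illustrate the state machine in `VSlistTo1HitPerLine.py`, here is a toy run on synthetic input (the input lines below are invented for illustration and are not real VecScreen output): a `>Vector` line sets the current query ID, a category header arms `hits_to_report`, and each following numeric coordinate line is emitted as one hit per line.

```python
import re

# Synthetic stand-in for VecScreen -f3 list output (hypothetical content).
lines = [
    ">Vector query1 some description",
    "Strong match",
    "10    75",
    "Suspect origin",
    "76    120",
    ">Vector query2 other",
]

hits_to_report, ID, out = "", "", []
for line in lines:
    line = line.strip()
    # Any non-numeric line ends the current run of coordinate lines.
    if not re.match(r"^[0-9 \t]+$", line):
        hits_to_report = ""
    if hits_to_report:
        out.append(f"VecScreen_{hits_to_report.ljust(8)}\t{ID}\t{line}")
        continue
    if line.startswith(">Vector "):
        ID = line.split(" ")[1]
        continue
    for category in ("Strong", "Moderate", "Weak", "Suspect"):
        if line.startswith(category):
            hits_to_report = category
            break

print(out)  # two hit lines: query1's Strong match and its Suspect segment
```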
59 changes: 59 additions & 0 deletions bin/chunk_assembly_for_vecscreen.py
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
"""
Script for chunking an assembly before running NCBI VecScreen. Adapted from a script by James Torrance.
The script was further refactored by Eerik Aunin and Yumi Sims
"""

from Bio import SeqIO
import argparse
import os


def process_record(record, chunk_num, threshold_length, overlap_length):
    return (
        record[chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length)]
        if chunk_num * threshold_length < len(record) - (threshold_length + overlap_length)
        else record[chunk_num * threshold_length :]
    )


def generate_records_to_write(record, threshold_length, overlap_length):
    return [
        (
            record_slice.id + ".chunk_{}".format(chunk_num + 1),
            "",
            record_slice,
        )
        for chunk_num, record_slice in enumerate(
            process_record(record, i, threshold_length, overlap_length)
            for i in range((len(record) - 1) // threshold_length + 1)
        )
    ]


def main(fasta_input_file, fasta_output_file):
    fasta_input_file = os.path.abspath(fasta_input_file)
    fasta_output_file = os.path.abspath(fasta_output_file)

    threshold_length = 500000
    overlap_length = int(threshold_length / 10)
    minimum_record_size = 11

    try:
        with open(fasta_output_file, "w") as fasta_output_handle:
            for record in SeqIO.parse(fasta_input_file, "fasta"):
                if len(record) >= minimum_record_size:
                    records_to_write = generate_records_to_write(record, threshold_length, overlap_length)
                    SeqIO.write(records_to_write, fasta_output_handle, "fasta")

    except Exception as e:
        print("An error occurred: {}".format(e))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("fasta_input_file", type=str, help="Path to input FASTA file")
    parser.add_argument("fasta_output_file", type=str, help="Path for FASTA output file")
    parser.add_argument("-v", "--version", action="version", version="1.0")
    args = parser.parse_args()
    main(args.fasta_input_file, args.fasta_output_file)
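The slicing arithmetic in `chunk_assembly_for_vecscreen.py` can be sketched without Biopython as plain coordinate pairs (the helper name is hypothetical; the thresholds mirror the script's defaults): each chunk starts every `threshold_length` bp and extends `overlap_length` bp into the next chunk, except the last, which simply runs to the end of the sequence.

```python
def chunk_coordinates(seq_len, threshold_length=500000, overlap_length=50000):
    """Return (start, end) slice coordinates for chunking a sequence of seq_len bp."""
    n_chunks = (seq_len - 1) // threshold_length + 1
    coords = []
    for chunk_num in range(n_chunks):
        start = chunk_num * threshold_length
        if start < seq_len - (threshold_length + overlap_length):
            # Interior chunk: extend into the next chunk by overlap_length.
            end = (chunk_num + 1) * threshold_length + overlap_length
        else:
            # Final chunk: run to the end of the sequence.
            end = seq_len
        coords.append((start, end))
    return coords

print(chunk_coordinates(1200000))
# [(0, 550000), (500000, 1050000), (1000000, 1200000)]
```

Note that consecutive chunks overlap by `overlap_length`, so a vector hit spanning a chunk boundary is still seen whole in one of the two chunks.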
79 changes: 39 additions & 40 deletions bin/parse_fcsgx_result.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Script for parsing the result files of FCS-GX, originally written by Eerik Aunin (ea10)
further minor modifications by Yumi Sims (yy5)
further refactoring/modifications by Yumi Sims (yy5)
"""

import general_purpose_functions as gpf
Expand Down Expand Up @@ -42,51 +42,50 @@ def load_taxids_data(taxonomy_file):
"""
Parses the *.taxonomy.rpt to find taxids that correspond to species names. Returns this as a dictionary
"""
    taxonomy_data = gpf.l(taxonomy_file)
    assert len(taxonomy_data) > 2
    taxonomy_data = taxonomy_data[2 : len(taxonomy_data)]
    taxonomy_data = gpf.l(taxonomy_file)[2:]
    assert taxonomy_data, "Taxonomy data is empty"

    collection_dict = OrderedDict()

    for line in taxonomy_data:
        add_new_entry = False
        multiple_divs_per_scaff = False
        split_line = line.split("\t")
        assert len(split_line) == 34
        scaff = split_line[0]
        if "~" in scaff:
            scaff = scaff.split("~")[0]
        tax_name_1 = split_line[5]
        tax_id_1 = split_line[6]
        div_1 = split_line[7]
        cvg_by_div_1 = split_line[8]
        if cvg_by_div_1 != "":
            cvg_by_div_1 = int(cvg_by_div_1)
        cvg_by_tax_1 = split_line[9]
        if cvg_by_tax_1 != "":
            cvg_by_tax_1 = int(cvg_by_tax_1)
        score_1 = split_line[10]
        if score_1 != "":
            score_1 = int(score_1)
        else:
            score_1 = 0
        if scaff in collection_dict:
            old_score = collection_dict[scaff]["fcs_gx_score"]
            if old_score < score_1:
                add_new_entry = True
                multiple_divs_per_scaff = True
        else:
            add_new_entry = True
        if add_new_entry is True:
            row_dict = dict()
            row_dict["fcs_gx_top_tax_name"] = tax_name_1
            row_dict["fcs_gx_top_taxid"] = tax_id_1
            row_dict["fcs_gx_div"] = div_1
            row_dict["fcs_gx_coverage_by_div"] = cvg_by_div_1
            row_dict["fcs_gx_coverage_by_tax"] = cvg_by_tax_1
            row_dict["fcs_gx_score"] = score_1
            row_dict["fcs_gx_multiple_divs_per_scaff"] = multiple_divs_per_scaff
            row_dict["fcs_gx_action"] = "NA"

        scaff = split_line[0].split("~")[0]
        tax_name_1, tax_id_1, div_1, cvg_by_div_1, cvg_by_tax_1, score_1 = (
            split_line[5],
            split_line[6],
            split_line[7],
            int(split_line[8]) if split_line[8] else None,
            int(split_line[9]) if split_line[9] else None,
            int(split_line[10]) if split_line[10] else 0,
        )

        if scaff in collection_dict and collection_dict[scaff]["fcs_gx_score"] < score_1:
            row_dict = {
                "fcs_gx_top_tax_name": tax_name_1,
                "fcs_gx_top_taxid": tax_id_1,
                "fcs_gx_div": div_1,
                "fcs_gx_coverage_by_div": cvg_by_div_1,
                "fcs_gx_coverage_by_tax": cvg_by_tax_1,
                "fcs_gx_score": score_1,
                "fcs_gx_multiple_divs_per_scaff": True,
                "fcs_gx_action": "NA",
            }
            collection_dict[scaff] = row_dict
        elif scaff not in collection_dict:
            row_dict = {
                "fcs_gx_top_tax_name": tax_name_1,
                "fcs_gx_top_taxid": tax_id_1,
                "fcs_gx_div": div_1,
                "fcs_gx_coverage_by_div": cvg_by_div_1,
                "fcs_gx_coverage_by_tax": cvg_by_tax_1,
                "fcs_gx_score": score_1,
                "fcs_gx_multiple_divs_per_scaff": False,
                "fcs_gx_action": "NA",
            }
            collection_dict[scaff] = row_dict

    return collection_dict
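A minimal sketch of the per-scaffold "keep the best-scoring row" rule that the refactored `load_taxids_data` implements (the helper name is hypothetical, and real rows carry the taxonomy fields omitted here): a scaffold's stored row is replaced only when a later line has a strictly higher score, which also flags the scaffold as having multiple divisions.

```python
def update_best(collection, scaff, score):
    # Replace the stored row only on a strictly higher score; first-seen wins ties.
    if scaff in collection and collection[scaff]["fcs_gx_score"] < score:
        collection[scaff] = {"fcs_gx_score": score, "fcs_gx_multiple_divs_per_scaff": True}
    elif scaff not in collection:
        collection[scaff] = {"fcs_gx_score": score, "fcs_gx_multiple_divs_per_scaff": False}
    return collection

rows = {}
for scaff, score in [("s1", 10), ("s1", 25), ("s1", 5), ("s2", 7)]:
    update_best(rows, scaff, score)

print(rows["s1"])  # {'fcs_gx_score': 25, 'fcs_gx_multiple_divs_per_scaff': True}
```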


62 changes: 62 additions & 0 deletions bin/summarise_vecscreen_output.py
@@ -0,0 +1,62 @@
#!/usr/bin/env python3
"""
Script for reformatting output from VecScreen that has been processed using the VSlistTo1HitPerLine.awk script.
This script removes no-hit entries from the report file and converts coordinates in assembly chunks to coordinates in the full assembly.
The original script was written in Perl by James Torrance. Rewritten by Eerik Aunin.
"""

import argparse
import os
import sys


def main(vecscreen_file, chunk_size):
    if os.path.isfile(vecscreen_file) is False:
        sys.stderr.write(f"The input file for this script ({vecscreen_file}) was not found\n")
        sys.exit(1)

    with open(vecscreen_file) as f:
        vecscreen_data = f.readlines()

    output_lines = list()

    for line in vecscreen_data:
        if line.startswith("VecScreen_No_Hits") is True:
            continue
        split_line = line.split()
        assert len(split_line) == 4
        vecscreen_result, seq_name, seq_start, seq_end = split_line
        seq_start = int(seq_start)
        seq_end = int(seq_end)
        seq_base_name = seq_name
        if ".chunk_" in seq_name:
            split_seq_name = seq_name.split(".chunk_")
            assert len(split_seq_name) == 2
            seq_base_name = split_seq_name[0]
            chunk_suffix = int(split_seq_name[1])
            seq_start += chunk_size * (chunk_suffix - 1)
            seq_end += chunk_size * (chunk_suffix - 1)
        corrected_line = "\t".join((vecscreen_result, seq_base_name, str(seq_start), str(seq_end)))
        if corrected_line not in output_lines:
            output_lines.append(corrected_line)

    for line in output_lines:
        print(line)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "vecscreen_file",
        type=str,
        help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk",
    )
    parser.add_argument(
        "--chunk_size",
        type=int,
        help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000",
        default=500000,
    )
    parser.add_argument("-v", "--version", action="version", version="1.0")
    args = parser.parse_args()
    main(args.vecscreen_file, args.chunk_size)
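The chunk-to-assembly coordinate conversion in `summarise_vecscreen_output.py` reduces to adding `chunk_size * (chunk_number - 1)` to both coordinates. A minimal sketch (hypothetical helper name; 1-based chunk numbering as in the `.chunk_3` suffixes produced by the chunking script):

```python
def to_assembly_coords(seq_name, start, end, chunk_size=500000):
    base_name = seq_name
    if ".chunk_" in seq_name:
        base_name, chunk_suffix = seq_name.split(".chunk_")
        offset = chunk_size * (int(chunk_suffix) - 1)  # chunk_1 starts at offset 0
        start += offset
        end += offset
    return base_name, start, end

print(to_assembly_coords("scaffold_1.chunk_3", 100, 250))
# ('scaffold_1', 1000100, 1000250)
```

Sequence names without a `.chunk_` suffix pass through unchanged.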
7 changes: 7 additions & 0 deletions conf/modules.config
@@ -95,4 +95,11 @@ process {
ext.prefix = { "${meta.id}_alignment_${reference.getName().tokenize('.')[0]}" }
}

withName: NCBITOOLS_VECSCREEN {
ext.args = { "-f3" }
}

withName: FILTER_VECSCREEN_RESULTS {
ext.args = "--skip_reporting_suspect_hits --skip_reporting_weak_hits --skip_reporting_no_hits"
}
}
5 changes: 5 additions & 0 deletions modules.json
@@ -73,6 +73,11 @@
"git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",
"installed_by": ["modules"]
},
"ncbitools/vecscreen": {
"branch": "master",
"git_sha": "1e4ac4aa2c612f9547f79f02ef7c651ccc9f657b",
"installed_by": ["modules"]
},
"samtools/depth": {
"branch": "master",
"git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",