From ca6e3d6c16f179fde1091766ca88d1c05cf85572 Mon Sep 17 00:00:00 2001
From: eeaunin <eeaunin@gmail.com>
Date: Thu, 16 Nov 2023 17:02:11 +0000
Subject: [PATCH 01/23] added VecScreen related files

---
 bin/VSlistTo1HitPerLine.py                    |  87 +++++++++++
 bin/chunk_assembly_for_vecscreen.py           |  54 +++++++
 bin/summarise_vecscreen_output.py             |  53 +++++++
 bin/trim_Ns.py                                | 144 ++++++++++++++++++
 conf/modules.config                           |   4 +
 modules.json                                  |   5 +
 modules/local/chunk_assembly_for_vecscreen.nf |  41 +++++
 modules/local/detect_trailing_n.nf            |  42 +++++
 modules/local/filter_vecscreen_results.nf     |  41 +++++
 modules/local/summarise_vecscreen_output.nf   |  41 +++++
 modules/nf-core/ncbitools/vecscreen/main.nf   |  46 ++++++
 modules/nf-core/ncbitools/vecscreen/meta.yml  |  50 ++++++
 subworkflows/local/run_vecscreen.nf           |  34 +++++
 13 files changed, 642 insertions(+)
 create mode 100755 bin/VSlistTo1HitPerLine.py
 create mode 100755 bin/chunk_assembly_for_vecscreen.py
 create mode 100755 bin/summarise_vecscreen_output.py
 create mode 100755 bin/trim_Ns.py
 create mode 100644 modules/local/chunk_assembly_for_vecscreen.nf
 create mode 100644 modules/local/detect_trailing_n.nf
 create mode 100644 modules/local/filter_vecscreen_results.nf
 create mode 100644 modules/local/summarise_vecscreen_output.nf
 create mode 100644 modules/nf-core/ncbitools/vecscreen/main.nf
 create mode 100644 modules/nf-core/ncbitools/vecscreen/meta.yml
 create mode 100644 subworkflows/local/run_vecscreen.nf

diff --git a/bin/VSlistTo1HitPerLine.py b/bin/VSlistTo1HitPerLine.py
new file mode 100755
index 0000000..55b8ca7
--- /dev/null
+++ b/bin/VSlistTo1HitPerLine.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""
+This script is based on an AWK script from NCBI: https://ftp.ncbi.nlm.nih.gov/pub/kitts/VSlistTo1HitPerLine.awk, orginally by Paul Kitts.
+Converted to Python by Eerik Aunin.
+
+This script converts the VecScreen text list output to one line giving the coordinates for each vector segment in the format:
+VecScreen_Category   ID_string   start_position   end_position
+The default is to report Strong, Moderate, and Weak matches and also segments of Suspect Origin. Reporting of any category can be suppressed by including 
+--skip_reporting_suspect_hits, --skip_reporting_weak_hits, --skip_reporting_moderate_hits or --skip_reporting_strong_hits on the command line. 
+"No hits" will be reported for any Query sequence that had no matches in any of the selected categories, unless --skip_reporting_no_hits is included on the command line.
+VecScreen errors will be reported unless --skip_reporting_errors is included on the command line.
+Usage:
+VSlistTo1HitPerLine.py [options] vecscreen_output_file
+"""
+
+import re
+import argparse
+
+def main(args):
+    hits_to_report = ""
+    ID = ""
+    hits = 0
+    error_found = 0
+
+    with open(args.vecscreen_output_file, "r") as file:
+        for line in file:
+            line = line.strip()
+            Fld = line.split(" ")
+
+            if not re.match(r'^[0-9 \t]+$', line):
+                hits_to_report = ""
+
+            if hits_to_report:
+                print(f"VecScreen_{hits_to_report.ljust(8)}\t{ID}\t{line}")
+                hits += 1
+                continue
+            
+            if line.startswith(">Vector "):
+                if ID != "":
+                    if error_found:
+                        print(f"VecScreen_ERROR   \t{ID}")
+                    elif hits == 0 and args.skip_reporting_no_hits is False:
+                        print(f"VecScreen_No_Hits \t{ID}")
+                ID = Fld[1]
+                hits = 0
+                error_found = 0
+                continue
+
+            if args.skip_reporting_strong_hits is False and line.startswith("Strong"):
+                hits_to_report = "Strong"
+                continue
+
+            if args.skip_reporting_moderate_hits is False and line.startswith("Moderate"):
+                hits_to_report = "Moderate"
+                continue
+
+            if args.skip_reporting_weak_hits is False and line.startswith("Weak"):
+                hits_to_report = "Weak"
+                continue
+
+            if args.skip_reporting_suspect_hits is False and line.startswith("Suspect"):
+                hits_to_report = "Suspect"
+                continue
+
+            if args.skip_reporting_errors is False:
+                if line.startswith("ERROR") or line.startswith("WARNING"):
+                    error_found += 1
+
+    if ID != "":
+        if error_found > 0:
+            print(f"VecScreen_ERROR   \t{ID}")
+        elif hits == 0 and args.skip_reporting_no_hits is False:
+            print(f"VecScreen_No_Hits \t{ID}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Reformatting VecScreen's output")
+    parser.add_argument("vecscreen_output_file", type=str, help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)", default=None)
+    parser.add_argument("--skip_reporting_strong_hits", action="store_true", help="Skip reporting strong hits")
+    parser.add_argument("--skip_reporting_moderate_hits", action="store_true", help="Skip reporting moderate hits")
+    parser.add_argument("--skip_reporting_weak_hits", action="store_true", help="Skip reporting weak hits")
+    parser.add_argument("--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin")
+    parser.add_argument("--skip_reporting_no_hits", action="store_true", help="Skip reporting no-hits")
+    parser.add_argument("--skip_reporting_errors", action="store_true", help="Skip reporting errors")
+    parser.add_argument("-v", "--version", action="version", version="1.0")
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file
diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py
new file mode 100755
index 0000000..5721745
--- /dev/null
+++ b/bin/chunk_assembly_for_vecscreen.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+"""
+Script for chunking an assembly before running NCBI VecScreen. Adapted from a script by James Torrance, edited by Eerik Aunin
+"""
+
+from Bio import SeqIO
+import argparse
+import os
+
+def main(fasta_input_file, fasta_output_file):
+	fasta_input_file = os.path.abspath(fasta_input_file)
+	fasta_output_file = os.path.abspath(fasta_output_file)
+
+	threshold_length = 500000
+	overlap_length = int( threshold_length / 10 )
+	minimum_record_size = 11
+
+	fasta_output_handle = open(fasta_output_file, 'w')
+
+	with open(fasta_input_file, 'r') as fasta_input_handle:
+		for record in SeqIO.parse(fasta_input_handle, "fasta"):
+
+			if len(record) >= minimum_record_size:
+				records_to_write = []
+
+				slice_count = 0
+				while (slice_count * threshold_length) < len(record) - (threshold_length+overlap_length):
+					record_slice = record[(slice_count*threshold_length):((slice_count+1)*threshold_length + overlap_length)]
+					record_slice.id += '.chunk_' + str(slice_count+1)
+
+					record_slice.description = ''
+					records_to_write.append(record_slice)
+					slice_count += 1
+
+				final_record_slice = record[(slice_count*threshold_length):]
+
+				if slice_count > 0:
+					final_record_slice.id += '.chunk_' + str(slice_count+1)
+				final_record_slice.description = ''
+
+				records_to_write.append(final_record_slice)
+
+				SeqIO.write(records_to_write, fasta_output_handle, 'fasta')
+
+	fasta_output_handle.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("fasta_input_file", type=str, help="Path to input FASTA file")
+    parser.add_argument("fasta_output_file", type=str, help="Path for FASTA output file")
+    parser.add_argument("-v", "--version", action="version", version="1.0")
+    args = parser.parse_args()
+    main(args.fasta_input_file, args.fasta_output_file)
diff --git a/bin/summarise_vecscreen_output.py b/bin/summarise_vecscreen_output.py
new file mode 100755
index 0000000..a513f53
--- /dev/null
+++ b/bin/summarise_vecscreen_output.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""
+Script for reformatting output from VecScreen that has been processed using the VSlistTo1HitPerLine.awk script. 
+This script removes no-hit entries from the report file and converts coordinates in assembly chunks to coordinates in the full assembly.
+The original script was written in Perl by James Torrance. Rewritten by Eerik Aunin.
+"""
+
+import argparse
+import os
+import sys
+
+
+def main(vecscreen_file, chunk_size):
+    if os.path.isfile(vecscreen_file) is False:
+        sys.stderr.write(f"The input file for this script ({vecscreen_file}) was not found\n")
+        sys.exit(1)
+
+    with open(vecscreen_file) as f:
+        vecscreen_data = f.readlines()
+
+    output_lines = list()
+
+    for line in vecscreen_data:
+        if line.startswith("VecScreen_No_Hits") is True:
+            continue
+        split_line = line.split()
+        assert len(split_line) == 4
+        vecscreen_result, seq_name, seq_start, seq_end = split_line
+        seq_start = int(seq_start)
+        seq_end = int(seq_end)
+        seq_base_name = seq_name
+        if ".chunk_" in seq_name:
+            split_seq_name = seq_name.split(".chunk_")
+            assert len(split_seq_name) == 2
+            seq_base_name = split_seq_name[0]
+            chunk_suffix = int(split_seq_name[1])
+            seq_start += chunk_size * (chunk_suffix - 1)
+            seq_end += chunk_size * (chunk_suffix - 1)
+        corrected_line = "\t".join((vecscreen_result, seq_base_name, str(seq_start), str(seq_end)))
+        if corrected_line not in output_lines:
+            output_lines.append(corrected_line)
+
+    for line in output_lines:
+        print(line)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("vecscreen_file", type=str, help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk")
+    parser.add_argument("--chunk_size", type=int, help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000", default=50000)
+    parser.add_argument("-v", "--version", action="version", version="1.0")
+    args = parser.parse_args()
+    main(args.vecscreen_file, args.chunk_size)
diff --git a/bin/trim_Ns.py b/bin/trim_Ns.py
new file mode 100755
index 0000000..3f20eb6
--- /dev/null
+++ b/bin/trim_Ns.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""
+Script for detecting trailing Ns that should be trimmed from an assembly, from James Torrance. Edited by Eerik Aunin
+"""
+
+import re
+from Bio import SeqIO
+import argparse
+
+def main(fasta_file, output_file):
+
+	minleftover = 200   # after trimming start/end, at least this many bp should be left
+	winsize     = 5000  # for sliding window analysis
+	minslidingBase = 0.4   # maximum fraction of Ns in sliding window before alarm sets off
+
+	output_handle = open(output_file, 'w')
+	with open(fasta_file, "r") as fasta_input_handle:
+		for record in SeqIO.parse(fasta_input_handle, "fasta"):
+			# output_handle.write('Testing ' + record.id + "\n")
+
+			# Do this twice: once with the regular string, once with the reversed string
+			# The output should be as one or more variables that can be reversed for the second iteration
+			seq_string = str(record.seq)
+
+			n_count = seq_string.count('N') + seq_string.count('n')
+			n_perc = n_count/ len(seq_string)
+			if n_perc > 0.8:
+				output_handle.write('# WARNING: ' + record.id + '\t' + str(int(n_perc * 10000)/100) + ' % Ns of total ' + str(len(seq_string)) + "\n")
+
+			# Consider handling this via two separate regexps for start and end. Adding character-class might be escalating run-time
+			#terminal_n_match = re.match('^([Nn]*)(\S+?)([Nn]*)$', seq_string)
+			start_n_match = re.match('^([Nn]*)', seq_string)
+			end_n_match = re.search('^([Nn]*)', seq_string[::-1])
+
+			startseq = ''
+			if start_n_match:
+				startseq = start_n_match.group(1)
+			endseq = ''
+			if end_n_match:
+				endseq = end_n_match.group(1)
+			realseq_length = len(seq_string) - len(startseq) - len(endseq)
+			# Handle "all Ns exception"
+			if len(startseq) == len(seq_string):
+				realseq_length = 0
+				endseq = '' 
+
+			if len(startseq) > 0 or len(endseq) > 0:
+				if len(startseq) > 0:
+					output_handle.write('\t'.join(["TRIM:",record.id,'1',str(len(startseq))]) + "\t\n")
+				if len(endseq) > 0:
+					output_handle.write('\t'.join(["TRIM:",record.id,str(len(startseq)+realseq_length+1),str(len(seq_string))]) + "\t\n")
+
+			if realseq_length <= minleftover:
+				output_handle.write("REMOVE: " + record.id + '\t' + str(realseq_length) + " bp leftover after trimming\n")
+			
+
+			# Only attempt the windowing approach if there's an initial window that doesn't meet the trigger condition
+			for seq_string_loop, seq_string_for_window in enumerate([seq_string, seq_string[::-1]]):
+				if len(seq_string_for_window) > winsize and (seq_string_for_window[:winsize].count('N') + seq_string_for_window[:winsize].count('n')) > winsize * minslidingBase:
+
+					non_n_regions = []
+					non_n_iterator = re.finditer('[^Nn]+', seq_string_for_window)
+					for non_n_instance in non_n_iterator:
+						# output_handle.write(record.id + '\t' + str(non_n_instance.start(0)) + '\t' + str(non_n_instance.end(0)) + '\n')
+
+						current_non_n_start = non_n_instance.start(0) + 1
+						current_non_n_end = non_n_instance.end(0)
+
+						non_n_regions.insert(0, [current_non_n_start, current_non_n_end])
+
+						# Does the *end* of this block satisfy the window condition?
+						bases_in_window = 0
+						start_of_first_base_block_in_window = None
+						for non_n_region in non_n_regions:
+							if non_n_region[1] >= current_non_n_end - winsize:
+								start_of_first_base_block_in_window = non_n_region[0]
+								if non_n_region[0] >= current_non_n_end - winsize:
+									bases_in_window += non_n_region[1] - non_n_region[0] + 1
+								else:
+									bases_in_window += non_n_region[1] - non_n_region[0] + 1
+									break
+							else:
+								break
+
+						if bases_in_window >= minslidingBase * winsize:
+							# Because start positions have BED-style
+
+							# Remember that the blocks are in *reverse* order along the sequence
+							trackback_to_start_flag = False
+
+							for i, non_n_region in enumerate(non_n_regions):
+								if i == 0:
+									continue
+								bases_in_window_2 = 0
+								if non_n_region[1] < non_n_regions[0][0] - winsize:
+									break
+								else:
+									current_window_start = max(non_n_region[0], non_n_regions[0][0] - winsize)
+
+									# Count the bases from this block
+									bases_in_window_2 += min(non_n_region[1], (current_window_start+winsize-1) ) - current_window_start + 1
+
+									# Add up all the sequence in blocks tested thus far, but not the final block
+									for j in range(1, i):
+										bases_in_window_2 += non_n_regions[j][1] - non_n_regions[j][0] + 1
+
+									# Count the sequence in the final block that can contribute
+									# This is the region between the start of the final block and either the end of the block
+									# or the end of a window extending from the current test start point, whichever comes first
+									bases_in_window_2 += min( non_n_regions[0][1], (current_window_start + winsize-1) ) - non_n_regions[0][0] + 1
+
+									# output_handle.write('BIW: ' + str(i) + ' ' + str(bases_in_window_2) + "\n")
+									if bases_in_window_2 >= minslidingBase * winsize:
+										if current_window_start == non_n_region[0]:
+											start_of_first_base_block_in_window = current_window_start
+										else:
+											start_of_first_base_block_in_window = non_n_regions[i-1][0]
+									else:
+										# We keep going back until the threshold condition is *not* met
+										break
+
+									# If we find the break-point should be before the first block, then we don't want to trim at all!										
+									if i == len(non_n_regions) - 1:
+										trackback_to_start_flag = True
+
+
+							# Only trim if the breakpoint isn't before the first block
+							# and if the breakpoint isn't at the start of the sequence
+							if not(trackback_to_start_flag) and start_of_first_base_block_in_window != 1:
+								if seq_string_loop == 0:
+									output_handle.write("FWDCLIP:\t" + record.id + "\t1\t" + str(start_of_first_base_block_in_window - 1) + "\n")
+								else:
+									output_handle.write("REVCLIP:\t" + record.id + '\t' + str(len(seq_string) - start_of_first_base_block_in_window + 2) + '\t' + str(len(seq_string)) + "\n")
+
+							break
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("fasta_file", type=str, help="Path to input FASTA file")
+    parser.add_argument("output_file", type=str, help="Path for output report file")
+    parser.add_argument("-v", "--version", action="version", version="1.0")
+    args = parser.parse_args()
+    main(args.fasta_file, args.output_file)
\ No newline at end of file
diff --git a/conf/modules.config b/conf/modules.config
index 900ec4a..2febc01 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -95,4 +95,8 @@ process {
             ext.prefix  = { "${meta.id}_alignment_${reference.getName().tokenize('.')[0]}" }
     }
 
+    withName: NCBITOOLS_VECSCREEN {
+        ext.args = { "-f3" }
+    }
+
 }
diff --git a/modules.json b/modules.json
index 6a8a74a..f8c9ac1 100644
--- a/modules.json
+++ b/modules.json
@@ -73,6 +73,11 @@
                         "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",
                         "installed_by": ["modules"]
                     },
+                    "ncbitools/vecscreen": {
+                        "branch": "master",
+                        "git_sha": "1e4ac4aa2c612f9547f79f02ef7c651ccc9f657b",
+                        "installed_by": ["modules"]
+                    },
                     "samtools/depth": {
                         "branch": "master",
                         "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",
diff --git a/modules/local/chunk_assembly_for_vecscreen.nf b/modules/local/chunk_assembly_for_vecscreen.nf
new file mode 100644
index 0000000..727e04b
--- /dev/null
+++ b/modules/local/chunk_assembly_for_vecscreen.nf
@@ -0,0 +1,41 @@
+process CHUNK_ASSEMBLY_FOR_VECSCREEN {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "bioconda::biopython=1.70"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py35_1':
+        'quay.io/biocontainers/biopython:1.70--np112py35_1' }"
+
+    input:
+    tuple val(meta), path(fasta_input_file)
+
+    output:
+    tuple val(meta), path("chunked_assembly.fa"), emit: chunked_assembly
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    chunk_assembly_for_vecscreen.py $fasta_input_file chunked_assembly.fa
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        chunk_assembly_for_vecscreen.py: \$(chunk_assembly_for_vecscreen.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch chunked_assembly.fa
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        chunk_assembly_for_vecscreen.py: \$(chunk_assembly_for_vecscreen.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/detect_trailing_n.nf b/modules/local/detect_trailing_n.nf
new file mode 100644
index 0000000..0ddf966
--- /dev/null
+++ b/modules/local/detect_trailing_n.nf
@@ -0,0 +1,42 @@
+process DETECT_TRAILING_N {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "bioconda::biopython=1.70"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py35_1':
+        'quay.io/biocontainers/biopython:1.70--np112py35_1' }"
+
+    input:
+    tuple val(meta), path(fasta_input_file)
+
+    output:
+    tuple val(meta), path("*_trim_Ns"), emit: trailing_n_report
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    trim_Ns.py ${fasta_input_file} ${prefix}_trim_Ns
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}_trim_Ns
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/local/filter_vecscreen_results.nf b/modules/local/filter_vecscreen_results.nf
new file mode 100644
index 0000000..38c701e
--- /dev/null
+++ b/modules/local/filter_vecscreen_results.nf
@@ -0,0 +1,41 @@
+process FILTER_VECSCREEN_RESULTS {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "conda-forge::python=3.9"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.9' :
+        'biocontainers/python:3.9' }"
+
+    input:
+    tuple val(meta), path(vecscreen_outfile)
+
+    output:
+    tuple val(meta), path("vecscreen.grepped.out"), emit: filtered_vecscreen_outfile
+    path "versions.yml"                           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    VSlistTo1HitPerLine.py --skip_reporting_suspect_hits --skip_reporting_weak_hits --skip_reporting_no_hits ${vecscreen_outfile} > vecscreen.grepped.out
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        VSlistTo1HitPerLine: \$(VSlistTo1HitPerLine.py -v)
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch vecscreen.grepped.out
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        VSlistTo1HitPerLine: \$(VSlistTo1HitPerLine.py -v)
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/summarise_vecscreen_output.nf b/modules/local/summarise_vecscreen_output.nf
new file mode 100644
index 0000000..3154c3e
--- /dev/null
+++ b/modules/local/summarise_vecscreen_output.nf
@@ -0,0 +1,41 @@
+process SUMMARISE_VECSCREEN_OUTPUT {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "conda-forge::python=3.9"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.9' :
+        'biocontainers/python:3.9' }"
+
+    input:
+    tuple val(meta), path(vecscreen_filtered_outfile)
+
+    output:
+    tuple val(meta), path("*.vecscreen_contamination"), emit: vecscreen_contamination
+    path "versions.yml"                               , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    summarise_vecscreen_output.py ${vecscreen_filtered_outfile} > ${prefix}.vecscreen_contamination
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        summarise_vecscreen_output.py: \$(summarise_vecscreen_output.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.vecscreen_contamination
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        summarise_vecscreen_output.py: \$(summarise_vecscreen_output.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/ncbitools/vecscreen/main.nf b/modules/nf-core/ncbitools/vecscreen/main.nf
new file mode 100644
index 0000000..a277eca
--- /dev/null
+++ b/modules/nf-core/ncbitools/vecscreen/main.nf
@@ -0,0 +1,46 @@
+process NCBITOOLS_VECSCREEN {
+    tag "$meta.id"
+    label 'process_single'
+
+    container "docker.io/biocontainers/ncbi-tools-bin:6.1.20170106-6-deb_cv2"
+
+    input:
+    tuple val(meta), path(fasta_file)
+    tuple val(adapters_database_meta), path(adapters_database_directory)
+
+    output:
+    tuple val(meta), path("${meta.id}.vecscreen.out")    , emit: vecscreen_output
+    path "versions.yml"                                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Exit if running this module with -profile conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "The VecScreen module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+    def args   = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // WARN: VecScreen doesn't output a version number and doesn't appear to have a Github repository. 1.0 is arbitrarily used here as the version number
+    """
+    DB=`find -L ${adapters_database_directory} -maxdepth 1 -name "*.nin" | sed 's/\\.nin\$//'`
+    vecscreen -d \$DB ${args} -i ${fasta_file} -o ${prefix}.vecscreen.out
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        vecscreen: 1.0
+    END_VERSIONS
+    """
+
+    stub:
+    // WARN: VecScreen doesn't output a version number and doesn't appear to have a Github repository. 1.0 is arbitrarily used here as the version number
+    """
+    touch ${prefix}.vecscreen.out
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        vecscreen: 1.0
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/ncbitools/vecscreen/meta.yml b/modules/nf-core/ncbitools/vecscreen/meta.yml
new file mode 100644
index 0000000..45f0f7e
--- /dev/null
+++ b/modules/nf-core/ncbitools/vecscreen/meta.yml
@@ -0,0 +1,50 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "NCBITOOLS_VECSCREEN"
+description: NCBI tool for detecting vector contamination in nucleic acid sequences. This tool is older than NCBI's FCS-adaptor, which is for the same purpose
+keywords:
+  - assembly
+  - genomics
+  - quality control
+  - contamination
+  - vector
+  - NCBI
+tools:
+  - "ncbitools":
+      description: |
+        "NCBI libraries for biology applications (text-based utilities)"
+      homepage: "https://www.ncbi.nlm.nih.gov/tools/vecscreen/"
+      documentation: "https://www.ncbi.nlm.nih.gov/tools/vecscreen/interpretation/"
+      tool_dev_url: "https://www.ncbi.nlm.nih.gov/tools/vecscreen/"
+      licence: ["The Open Database License"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', taxid:'6973' ]
+  - fasta_file:
+      type: file
+      description: FASTA file that will be screened for contaminants
+  - adapters_database_file:
+      type: file
+      description: Path to a nucleotide BLAST database file with vector sequences
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', taxid:'9606' ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - vecscreen_output:
+      type: file
+      description: VecScreen report file. This can be in different formats depending on the value of the optional -f  parameter. 0 = HTML format, with alignments. 1 = HTML format, no alignments. 2 = Text list, with alignments. 3 = Text list, no alignments. default = 0
+      pattern: "*.vecscreen.out"
+
+authors:
+  - "@eeaunin"
diff --git a/subworkflows/local/run_vecscreen.nf b/subworkflows/local/run_vecscreen.nf
new file mode 100644
index 0000000..645fe82
--- /dev/null
+++ b/subworkflows/local/run_vecscreen.nf
@@ -0,0 +1,34 @@
+#!/usr/bin/env nextflow
+// MODULE IMPORT BLOCK
+
+include { CHUNK_ASSEMBLY_FOR_VECSCREEN }    from '../../modules/local/chunk_assembly_for_vecscreen'
+include { NCBITOOLS_VECSCREEN }             from '../../modules/nf-core/ncbitools/vecscreen/main'
+include { FILTER_VECSCREEN_RESULTS }        from '../../modules/local/filter_vecscreen_results'
+include { SUMMARISE_VECSCREEN_OUTPUT }      from '../../modules/local/summarise_vecscreen_output'
+
+
+workflow RUN_VECSCREEN {
+    take:
+    reference_tuple         // val(meta), path(fasta)
+    adapters_blastn_database_directory
+
+    main:
+    ch_versions = Channel.empty()
+    // MODULE: CHUNKS THE ASSEMBLY INTO PIECES WITH FIXED LENGTH
+    CHUNK_ASSEMBLY_FOR_VECSCREEN(reference_tuple)
+    ch_versions = ch_versions.mix(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.versions)
+    // MODULE: RUNS NCBI VECSCREEN
+    NCBITOOLS_VECSCREEN(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, [ [id:'vecscreen_adapters_database'], adapters_blastn_database_directory ])
+    ch_versions = ch_versions.mix(NCBITOOLS_VECSCREEN.out.versions)
+    // MODULE: REFORMATS VECSCREEN OUTPUT AND FILTERS IT TO REMOVE NO-HIT RESULTS AND KEEP ONLY HITS
+    FILTER_VECSCREEN_RESULTS(NCBITOOLS_VECSCREEN.out.vecscreen_output)
+    ch_versions = ch_versions.mix(FILTER_VECSCREEN_RESULTS.out.versions)
+    // MODULE: CONVERTS COORDINATES IN ASSEMBLY CHUNKS BACK TO COORDINATES IN THE WHOLE ASSEMBLY AND WRITES A REPORT FILE
+    SUMMARISE_VECSCREEN_OUTPUT(FILTER_VECSCREEN_RESULTS.out.filtered_vecscreen_outfile)
+    ch_versions = ch_versions.mix(SUMMARISE_VECSCREEN_OUTPUT.out.versions)
+
+    emit:
+    vecscreen_contamination     = SUMMARISE_VECSCREEN_OUTPUT.out.vecscreen_contamination
+    versions                    = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
+}
+

From 568e9635a39aee148527e720d6a4f7bc7a1581f4 Mon Sep 17 00:00:00 2001
From: eeaunin <eeaunin@gmail.com>
Date: Thu, 16 Nov 2023 17:09:53 +0000
Subject: [PATCH 02/23] Edited the main workflow and test yaml

---
 assets/test.yaml       |   2 +-
 workflows/ascc.nf      |  28 +++++
 workflows/ascc_full.nf | 246 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 workflows/ascc_full.nf

diff --git a/assets/test.yaml b/assets/test.yaml
index 73d0697..d418a26 100755
--- a/assets/test.yaml
+++ b/assets/test.yaml
@@ -17,7 +17,7 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu
 ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
 busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages
 fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
-vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database/adaptors_for_screening_euks.fa
+vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database
 diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd
 diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd
 seqkit:
diff --git a/workflows/ascc.nf b/workflows/ascc.nf
index 220606f..8a98914 100644
--- a/workflows/ascc.nf
+++ b/workflows/ascc.nf
@@ -34,10 +34,12 @@ include { RUN_NT_KRAKEN                 } from '../subworkflows/local/run_nt_kra
 include { RUN_FCSGX                     } from '../subworkflows/local/run_fcsgx'
 include { PACBIO_BARCODE_CHECK          } from '../subworkflows/local/pacbio_barcode_check'
 include { RUN_READ_COVERAGE             } from '../subworkflows/local/run_read_coverage'
+include { RUN_VECSCREEN                 } from '../subworkflows/local/run_vecscreen'
 include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST  } from '../subworkflows/local/organellar_blast'
 include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST     } from '../subworkflows/local/organellar_blast'
 
 
+
 //
 // MODULE: Local modules
 //
@@ -95,10 +97,13 @@ workflow ASCC {
     //
     // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA
     //
+
+    /*
     EXTRACT_TIARA_HITS (
         GENERATE_GENOME.out.reference_tuple
     )
     ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions)
+    */
 
     //
     // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE
@@ -129,54 +134,66 @@ workflow ASCC {
     //
     // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH
     //
+
+    /*
     YAML_INPUT.out.mito_tuple
         .branch { meta, check ->
             valid:      check != "NO MITO"
             invalid:    check == "NO MITO"
         }
         .set { mito_check }
+    */
 
     //
     // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME
     //
+    /*
     MITO_ORGANELLAR_BLAST (
         YAML_INPUT.out.reference_tuple,
         YAML_INPUT.out.mito_var,
         mito_check.valid
     )
     ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions)
+    */
 
     //
     // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH
     //
+    /*
     YAML_INPUT.out.plastid_tuple
         .branch { meta, check ->
             valid:      check != "NO PLASTID"
             invalid:    check == "NO PLASTID"
         }
         .set { plastid_check }
+    */
 
     //
     // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME
     //
+    /*
     PLASTID_ORGANELLAR_BLAST (
         YAML_INPUT.out.reference_tuple,
         YAML_INPUT.out.plastid_var,
         plastid_check.valid
     )
     ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions)
+    */
 
     //
     // SUBWORKFLOW:
     //
+    /*
     RUN_FCSADAPTOR (
         YAML_INPUT.out.reference_tuple
     )
     ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions)
+    */
 
     //
     // SUBWORKFLOW:
     //
+    /*
     RUN_FCSGX (
         YAML_INPUT.out.reference_tuple,
         YAML_INPUT.out.fcs_gx_database_path,
@@ -184,6 +201,7 @@ workflow ASCC {
         YAML_INPUT.out.ncbi_rankedlineage_path
     )
     ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions)
+    */
 
     //
     // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA
@@ -199,6 +217,7 @@ workflow ASCC {
     //
     // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE
     //
+    /*
     RUN_READ_COVERAGE (
         YAML_INPUT.out.reference_tuple,
         YAML_INPUT.out.assembly_path,
@@ -206,10 +225,19 @@ workflow ASCC {
         YAML_INPUT.out.reads_type
     )
     ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions)
+    */
 
     //
     // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS
     //
+
+    RUN_VECSCREEN (
+        GENERATE_GENOME.out.reference_tuple,
+        "/lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database"
+        //YAML_INPUT.out.vecscreen_database_path
+    )
+    ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions)
+
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
diff --git a/workflows/ascc_full.nf b/workflows/ascc_full.nf
new file mode 100644
index 0000000..3107122
--- /dev/null
+++ b/workflows/ascc_full.nf
@@ -0,0 +1,246 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    PRINT PARAMS SUMMARY
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation'
+
+def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs)
+def citation = '\n' + WorkflowMain.citation(workflow) + '\n'
+def summary_params = paramsSummaryMap(workflow)
+
+// Print parameter summary log to screen
+log.info logo + paramsSummaryLog(workflow) + citation
+
+WorkflowAscc.initialise(params, log)
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    IMPORT LOCAL MODULES/SUBWORKFLOWS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+//
+// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
+//
+
+include { YAML_INPUT                    } from '../subworkflows/local/yaml_input'
+include { GENERATE_GENOME               } from '../subworkflows/local/generate_genome'
+include { EXTRACT_TIARA_HITS            } from '../subworkflows/local/extract_tiara_hits'
+include { EXTRACT_NT_BLAST              } from '../subworkflows/local/extract_nt_blast'
+include { RUN_FCSADAPTOR                } from '../subworkflows/local/run_fcsadaptor'
+include { RUN_NT_KRAKEN                 } from '../subworkflows/local/run_nt_kraken'
+include { RUN_FCSGX                     } from '../subworkflows/local/run_fcsgx'
+include { PACBIO_BARCODE_CHECK          } from '../subworkflows/local/pacbio_barcode_check'
+include { RUN_READ_COVERAGE             } from '../subworkflows/local/run_read_coverage'
+include { RUN_VECSCREEN                 } from '../subworkflows/local/run_vecscreen'
+include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST  } from '../subworkflows/local/organellar_blast'
+include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST     } from '../subworkflows/local/organellar_blast'
+
+
+
+//
+// MODULE: Local modules
+//
+include { GC_CONTENT                                    } from '../modules/local/gc_content'
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    IMPORT NF-CORE MODULES/SUBWORKFLOWS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+//
+// MODULE: Installed directly from nf-core/modules
+//
+include { CUSTOM_DUMPSOFTWAREVERSIONS                   } from '../modules/nf-core/custom/dumpsoftwareversions/main'
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    RUN MAIN WORKFLOW
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+workflow ASCC {
+
+    main:
+    ch_versions = Channel.empty()
+
+    input_ch = Channel.fromPath(params.input, checkIfExists: true)
+
+    //
+    // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE
+    //
+    YAML_INPUT (
+        input_ch
+    )
+    ch_versions = ch_versions.mix(YAML_INPUT.out.versions)
+
+    //
+    // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA
+    //
+    GC_CONTENT (
+        YAML_INPUT.out.reference_tuple
+    )
+    ch_versions = ch_versions.mix(GC_CONTENT.out.versions)
+
+    //
+    // SUBWORKFLOW: GENERATE GENOME FILE
+    //
+    GENERATE_GENOME (
+        YAML_INPUT.out.reference_tuple,
+        YAML_INPUT.out.pacbio_barcodes
+    )
+    ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions)
+
+    //
+    // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA
+    //
+    EXTRACT_TIARA_HITS (
+        GENERATE_GENOME.out.reference_tuple
+    )
+    ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions)
+
+    //
+    // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE
+    //
+    YAML_INPUT.out.reference_tuple
+        .combine ( YAML_INPUT.out.seqkit_sliding.toInteger() )
+        .combine ( YAML_INPUT.out.seqkit_window.toInteger() )
+        .map { meta, ref, sliding, window ->
+            tuple([ id      : meta.id,
+                    sliding : sliding,
+                    window  : window
+                ],
+                file(ref)
+            )}
+        .set { modified_input }
+
+    //
+    // SUBWORKFLOW: EXTRACT RESULTS HITS FROM NT-BLAST
+    //
+/*     EXTRACT_NT_BLAST (
+        modified_input,
+        YAML_INPUT.out.nt_database,
+        YAML_INPUT.out.ncbi_accessions,
+        YAML_INPUT.out.ncbi_rankedlineage_path
+    )
+    ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) */
+
+    //
+    // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH
+    //
+    YAML_INPUT.out.mito_tuple
+        .branch { meta, check ->
+            valid:      check != "NO MITO"
+            invalid:    check == "NO MITO"
+        }
+        .set { mito_check }
+
+    //
+    // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME
+    //
+    MITO_ORGANELLAR_BLAST (
+        YAML_INPUT.out.reference_tuple,
+        YAML_INPUT.out.mito_var,
+        mito_check.valid
+    )
+    ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions)
+
+    //
+    // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH
+    //
+    YAML_INPUT.out.plastid_tuple
+        .branch { meta, check ->
+            valid:      check != "NO PLASTID"
+            invalid:    check == "NO PLASTID"
+        }
+        .set { plastid_check }
+
+    //
+    // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME
+    //
+    PLASTID_ORGANELLAR_BLAST (
+        YAML_INPUT.out.reference_tuple,
+        YAML_INPUT.out.plastid_var,
+        plastid_check.valid
+    )
+    ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions)
+
+    //
+    // SUBWORKFLOW:
+    //
+    RUN_FCSADAPTOR (
+        YAML_INPUT.out.reference_tuple
+    )
+    ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions)
+
+    //
+    // SUBWORKFLOW:
+    //
+    RUN_FCSGX (
+        YAML_INPUT.out.reference_tuple,
+        YAML_INPUT.out.fcs_gx_database_path,
+        YAML_INPUT.out.taxid,
+        YAML_INPUT.out.ncbi_rankedlineage_path
+    )
+    ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions)
+
+    //
+    // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA
+    //
+    /*PACBIO_BARCODE_CHECK (
+        YAML_INPUT.out.reference_tuple,
+        YAML_INPUT.out.pacbio_tuple,
+        YAML_INPUT.out.pacbio_barcodes,
+        YAML_INPUT.out.pacbio_multiplex_codes
+    )
+    ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions)*/
+
+    //
+    // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE
+    //
+    RUN_READ_COVERAGE (
+        YAML_INPUT.out.reference_tuple,
+        YAML_INPUT.out.assembly_path,
+        YAML_INPUT.out.pacbio_tuple,
+        YAML_INPUT.out.reads_type
+    )
+    ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions)
+
+    //
+    // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS
+    //
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_versions.unique().collectFile(name: 'collated_versions.yml')
+    )
+
+    emit:
+    software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml
+    versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions
+}
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    COMPLETION EMAIL AND SUMMARY
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+workflow.onComplete {
+    if (params.email || params.email_on_fail) {
+        NfcoreTemplate.email(workflow, params, summary_params, projectDir, log)
+    }
+    NfcoreTemplate.summary(workflow, params, log)
+    if (params.hook_url) {
+        NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log)
+    }
+    // TreeValProject.summary(workflow, reference_tuple, summary_params, projectDir)
+
+}
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    THE END
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/

From 87cffd9dcb684e79cfe656f1142c6f956dc497c8 Mon Sep 17 00:00:00 2001
From: eeaunin <eeaunin@gmail.com>
Date: Thu, 16 Nov 2023 17:15:25 +0000
Subject: [PATCH 03/23] Edited the main workflow

---
 workflows/ascc.nf | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/workflows/ascc.nf b/workflows/ascc.nf
index 8a98914..1f596ed 100644
--- a/workflows/ascc.nf
+++ b/workflows/ascc.nf
@@ -80,10 +80,12 @@ workflow ASCC {
     //
     // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA
     //
+    /*
     GC_CONTENT (
         YAML_INPUT.out.reference_tuple
     )
     ch_versions = ch_versions.mix(GC_CONTENT.out.versions)
+    */
 
     //
     // SUBWORKFLOW: GENERATE GENOME FILE

From f045b698c223dce8b0af5893fa8b7d9977e9f320 Mon Sep 17 00:00:00 2001
From: eeaunin <eeaunin@gmail.com>
Date: Thu, 16 Nov 2023 17:28:06 +0000
Subject: [PATCH 04/23] Edited the test yaml file

---
 assets/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assets/test.yaml b/assets/test.yaml
index d418a26..5e1e398 100755
--- a/assets/test.yaml
+++ b/assets/test.yaml
@@ -1,4 +1,4 @@
-assembly_path: /lustre/scratch124/tol/projects/tol/data/insects/Polyommatus_atlantica/assembly/draft/treeval/ilPolAtla1_merged/raw/ref.fa
+assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2.fasta
 assembly_title: asccTinyTest
 reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
 reads_type: "hifi"

From dcfb977fe5f7c29826e31410d8f8de66c35b7ac3 Mon Sep 17 00:00:00 2001
From: eeaunin <eeaunin@gmail.com>
Date: Thu, 16 Nov 2023 17:31:05 +0000
Subject: [PATCH 05/23] Edited the test yaml file

---
 assets/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assets/test.yaml b/assets/test.yaml
index 5e1e398..cf55c9d 100755
--- a/assets/test.yaml
+++ b/assets/test.yaml
@@ -1,4 +1,4 @@
-assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2.fasta
+assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2_fh.fasta
 assembly_title: asccTinyTest
 reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
 reads_type: "hifi"

From 320e0716cd0521bf5f7d6b26a94d9e3036540c5b Mon Sep 17 00:00:00 2001
From: eeaunin <eeaunin@gmail.com>
Date: Thu, 16 Nov 2023 17:36:16 +0000
Subject: [PATCH 06/23] Edited the test yaml file

---
 assets/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assets/test.yaml b/assets/test.yaml
index cf55c9d..2f3f9e1 100755
--- a/assets/test.yaml
+++ b/assets/test.yaml
@@ -1,4 +1,4 @@
-assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2_fh.fasta
+assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2_fh2.fasta
 assembly_title: asccTinyTest
 reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
 reads_type: "hifi"

From ddc2f300dfde6985bafc5fdca8c1653bdf4cc705 Mon Sep 17 00:00:00 2001
From: eeaunin <eeaunin@gmail.com>
Date: Thu, 16 Nov 2023 18:00:31 +0000
Subject: [PATCH 07/23] Deleted trailing Ns scripts and reformatted the
 VecScreen Python scripts with black

---
 bin/VSlistTo1HitPerLine.py          |  18 +++-
 bin/chunk_assembly_for_vecscreen.py |  52 +++++-----
 bin/summarise_vecscreen_output.py   |  13 ++-
 bin/trim_Ns.py                      | 144 ----------------------------
 modules/local/detect_trailing_n.nf  |  42 --------
 5 files changed, 51 insertions(+), 218 deletions(-)
 delete mode 100755 bin/trim_Ns.py
 delete mode 100644 modules/local/detect_trailing_n.nf

diff --git a/bin/VSlistTo1HitPerLine.py b/bin/VSlistTo1HitPerLine.py
index 55b8ca7..87ea39d 100755
--- a/bin/VSlistTo1HitPerLine.py
+++ b/bin/VSlistTo1HitPerLine.py
@@ -16,6 +16,7 @@
 import re
 import argparse
 
+
 def main(args):
     hits_to_report = ""
     ID = ""
@@ -27,14 +28,14 @@ def main(args):
             line = line.strip()
             Fld = line.split(" ")
 
-            if not re.match(r'^[0-9 \t]+$', line):
+            if not re.match(r"^[0-9 \t]+$", line):
                 hits_to_report = ""
 
             if hits_to_report:
                 print(f"VecScreen_{hits_to_report.ljust(8)}\t{ID}\t{line}")
                 hits += 1
                 continue
-            
+
             if line.startswith(">Vector "):
                 if ID != "":
                     if error_found:
@@ -75,13 +76,20 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Reformatting VecScreen's output")
-    parser.add_argument("vecscreen_output_file", type=str, help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)", default=None)
+    parser.add_argument(
+        "vecscreen_output_file",
+        type=str,
+        help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)",
+        default=None,
+    )
     parser.add_argument("--skip_reporting_strong_hits", action="store_true", help="Skip reporting strong hits")
     parser.add_argument("--skip_reporting_moderate_hits", action="store_true", help="Skip reporting moderate hits")
     parser.add_argument("--skip_reporting_weak_hits", action="store_true", help="Skip reporting weak hits")
-    parser.add_argument("--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin")
+    parser.add_argument(
+        "--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin"
+    )
     parser.add_argument("--skip_reporting_no_hits", action="store_true", help="Skip reporting no-hits")
     parser.add_argument("--skip_reporting_errors", action="store_true", help="Skip reporting errors")
     parser.add_argument("-v", "--version", action="version", version="1.0")
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+    main(args)
diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py
index 5721745..321777c 100755
--- a/bin/chunk_assembly_for_vecscreen.py
+++ b/bin/chunk_assembly_for_vecscreen.py
@@ -7,42 +7,44 @@
 import argparse
 import os
 
-def main(fasta_input_file, fasta_output_file):
-	fasta_input_file = os.path.abspath(fasta_input_file)
-	fasta_output_file = os.path.abspath(fasta_output_file)
 
-	threshold_length = 500000
-	overlap_length = int( threshold_length / 10 )
-	minimum_record_size = 11
+def main(fasta_input_file, fasta_output_file):
+    fasta_input_file = os.path.abspath(fasta_input_file)
+    fasta_output_file = os.path.abspath(fasta_output_file)
 
-	fasta_output_handle = open(fasta_output_file, 'w')
+    threshold_length = 500000
+    overlap_length = int(threshold_length / 10)
+    minimum_record_size = 11
 
-	with open(fasta_input_file, 'r') as fasta_input_handle:
-		for record in SeqIO.parse(fasta_input_handle, "fasta"):
+    fasta_output_handle = open(fasta_output_file, "w")
 
-			if len(record) >= minimum_record_size:
-				records_to_write = []
+    with open(fasta_input_file, "r") as fasta_input_handle:
+        for record in SeqIO.parse(fasta_input_handle, "fasta"):
+            if len(record) >= minimum_record_size:
+                records_to_write = []
 
-				slice_count = 0
-				while (slice_count * threshold_length) < len(record) - (threshold_length+overlap_length):
-					record_slice = record[(slice_count*threshold_length):((slice_count+1)*threshold_length + overlap_length)]
-					record_slice.id += '.chunk_' + str(slice_count+1)
+                slice_count = 0
+                while (slice_count * threshold_length) < len(record) - (threshold_length + overlap_length):
+                    record_slice = record[
+                        (slice_count * threshold_length) : ((slice_count + 1) * threshold_length + overlap_length)
+                    ]
+                    record_slice.id += ".chunk_" + str(slice_count + 1)
 
-					record_slice.description = ''
-					records_to_write.append(record_slice)
-					slice_count += 1
+                    record_slice.description = ""
+                    records_to_write.append(record_slice)
+                    slice_count += 1
 
-				final_record_slice = record[(slice_count*threshold_length):]
+                final_record_slice = record[(slice_count * threshold_length) :]
 
-				if slice_count > 0:
-					final_record_slice.id += '.chunk_' + str(slice_count+1)
-				final_record_slice.description = ''
+                if slice_count > 0:
+                    final_record_slice.id += ".chunk_" + str(slice_count + 1)
+                final_record_slice.description = ""
 
-				records_to_write.append(final_record_slice)
+                records_to_write.append(final_record_slice)
 
-				SeqIO.write(records_to_write, fasta_output_handle, 'fasta')
+                SeqIO.write(records_to_write, fasta_output_handle, "fasta")
 
-	fasta_output_handle.close()
+    fasta_output_handle.close()
 
 
 if __name__ == "__main__":
diff --git a/bin/summarise_vecscreen_output.py b/bin/summarise_vecscreen_output.py
index a513f53..50cb173 100755
--- a/bin/summarise_vecscreen_output.py
+++ b/bin/summarise_vecscreen_output.py
@@ -46,8 +46,17 @@ def main(vecscreen_file, chunk_size):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("vecscreen_file", type=str, help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk")
-    parser.add_argument("--chunk_size", type=int, help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000", default=50000)
+    parser.add_argument(
+        "vecscreen_file",
+        type=str,
+        help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk",
+    )
+    parser.add_argument(
+        "--chunk_size",
+        type=int,
+        help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000",
+        default=50000,
+    )
     parser.add_argument("-v", "--version", action="version", version="1.0")
     args = parser.parse_args()
     main(args.vecscreen_file, args.chunk_size)
diff --git a/bin/trim_Ns.py b/bin/trim_Ns.py
deleted file mode 100755
index 3f20eb6..0000000
--- a/bin/trim_Ns.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-"""
-Script for detecting trailing Ns that should be trimmed from an assembly, from James Torrance. Edited by Eerik Aunin
-"""
-
-import re
-from Bio import SeqIO
-import argparse
-
-def main(fasta_file, output_file):
-
-	minleftover = 200   # after trimming start/end, at least this many bp should be left
-	winsize     = 5000  # for sliding window analysis
-	minslidingBase = 0.4   # maximum fraction of Ns in sliding window before alarm sets off
-
-	output_handle = open(output_file, 'w')
-	with open(fasta_file, "r") as fasta_input_handle:
-		for record in SeqIO.parse(fasta_input_handle, "fasta"):
-			# output_handle.write('Testing ' + record.id + "\n")
-
-			# Do this twice: once with the regular string, once with the reversed string
-			# The output should be as one or more variables that can be reversed for the second iteration
-			seq_string = str(record.seq)
-
-			n_count = seq_string.count('N') + seq_string.count('n')
-			n_perc = n_count/ len(seq_string)
-			if n_perc > 0.8:
-				output_handle.write('# WARNING: ' + record.id + '\t' + str(int(n_perc * 10000)/100) + ' % Ns of total ' + str(len(seq_string)) + "\n")
-
-			# Consider handling this via two separate regexps for start and end. Adding character-class might be escalating run-time
-			#terminal_n_match = re.match('^([Nn]*)(\S+?)([Nn]*)$', seq_string)
-			start_n_match = re.match('^([Nn]*)', seq_string)
-			end_n_match = re.search('^([Nn]*)', seq_string[::-1])
-
-			startseq = ''
-			if start_n_match:
-				startseq = start_n_match.group(1)
-			endseq = ''
-			if end_n_match:
-				endseq = end_n_match.group(1)
-			realseq_length = len(seq_string) - len(startseq) - len(endseq)
-			# Handle "all Ns exception"
-			if len(startseq) == len(seq_string):
-				realseq_length = 0
-				endseq = '' 
-
-			if len(startseq) > 0 or len(endseq) > 0:
-				if len(startseq) > 0:
-					output_handle.write('\t'.join(["TRIM:",record.id,'1',str(len(startseq))]) + "\t\n")
-				if len(endseq) > 0:
-					output_handle.write('\t'.join(["TRIM:",record.id,str(len(startseq)+realseq_length+1),str(len(seq_string))]) + "\t\n")
-
-			if realseq_length <= minleftover:
-				output_handle.write("REMOVE: " + record.id + '\t' + str(realseq_length) + " bp leftover after trimming\n")
-			
-
-			# Only attempt the windowing approach if there's an initial window that doesn't meet the trigger condition
-			for seq_string_loop, seq_string_for_window in enumerate([seq_string, seq_string[::-1]]):
-				if len(seq_string_for_window) > winsize and (seq_string_for_window[:winsize].count('N') + seq_string_for_window[:winsize].count('n')) > winsize * minslidingBase:
-
-					non_n_regions = []
-					non_n_iterator = re.finditer('[^Nn]+', seq_string_for_window)
-					for non_n_instance in non_n_iterator:
-						# output_handle.write(record.id + '\t' + str(non_n_instance.start(0)) + '\t' + str(non_n_instance.end(0)) + '\n')
-
-						current_non_n_start = non_n_instance.start(0) + 1
-						current_non_n_end = non_n_instance.end(0)
-
-						non_n_regions.insert(0, [current_non_n_start, current_non_n_end])
-
-						# Does the *end* of this block satisfy the window condition?
-						bases_in_window = 0
-						start_of_first_base_block_in_window = None
-						for non_n_region in non_n_regions:
-							if non_n_region[1] >= current_non_n_end - winsize:
-								start_of_first_base_block_in_window = non_n_region[0]
-								if non_n_region[0] >= current_non_n_end - winsize:
-									bases_in_window += non_n_region[1] - non_n_region[0] + 1
-								else:
-									bases_in_window += non_n_region[1] - non_n_region[0] + 1
-									break
-							else:
-								break
-
-						if bases_in_window >= minslidingBase * winsize:
-							# Because start positions have BED-style
-
-							# Remember that the blocks are in *reverse* order along the sequence
-							trackback_to_start_flag = False
-
-							for i, non_n_region in enumerate(non_n_regions):
-								if i == 0:
-									continue
-								bases_in_window_2 = 0
-								if non_n_region[1] < non_n_regions[0][0] - winsize:
-									break
-								else:
-									current_window_start = max(non_n_region[0], non_n_regions[0][0] - winsize)
-
-									# Count the bases from this block
-									bases_in_window_2 += min(non_n_region[1], (current_window_start+winsize-1) ) - current_window_start + 1
-
-									# Add up all the sequence in blocks tested thus far, but not the final block
-									for j in range(1, i):
-										bases_in_window_2 += non_n_regions[j][1] - non_n_regions[j][0] + 1
-
-									# Count the sequence in the final block that can contribute
-									# This is the region between the start of the final block and either the end of the block
-									# or the end of a window extending from the current test start point, whichever comes first
-									bases_in_window_2 += min( non_n_regions[0][1], (current_window_start + winsize-1) ) - non_n_regions[0][0] + 1
-
-									# output_handle.write('BIW: ' + str(i) + ' ' + str(bases_in_window_2) + "\n")
-									if bases_in_window_2 >= minslidingBase * winsize:
-										if current_window_start == non_n_region[0]:
-											start_of_first_base_block_in_window = current_window_start
-										else:
-											start_of_first_base_block_in_window = non_n_regions[i-1][0]
-									else:
-										# We keep going back until the threshold condition is *not* met
-										break
-
-									# If we find the break-point should be before the first block, then we don't want to trim at all!										
-									if i == len(non_n_regions) - 1:
-										trackback_to_start_flag = True
-
-
-							# Only trim if the breakpoint isn't before the first block
-							# and if the breakpoint isn't at the start of the sequence
-							if not(trackback_to_start_flag) and start_of_first_base_block_in_window != 1:
-								if seq_string_loop == 0:
-									output_handle.write("FWDCLIP:\t" + record.id + "\t1\t" + str(start_of_first_base_block_in_window - 1) + "\n")
-								else:
-									output_handle.write("REVCLIP:\t" + record.id + '\t' + str(len(seq_string) - start_of_first_base_block_in_window + 2) + '\t' + str(len(seq_string)) + "\n")
-
-							break
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("fasta_file", type=str, help="Path to input FASTA file")
-    parser.add_argument("output_file", type=str, help="Path for output report file")
-    parser.add_argument("-v", "--version", action="version", version="1.0")
-    args = parser.parse_args()
-    main(args.fasta_file, args.output_file)
\ No newline at end of file
diff --git a/modules/local/detect_trailing_n.nf b/modules/local/detect_trailing_n.nf
deleted file mode 100644
index 0ddf966..0000000
--- a/modules/local/detect_trailing_n.nf
+++ /dev/null
@@ -1,42 +0,0 @@
-process DETECT_TRAILING_N {
-    tag "$meta.id"
-    label 'process_single'
-
-    conda "bioconda::biopython=1.70"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py35_1':
-        'quay.io/biocontainers/biopython:1.70--np112py35_1' }"
-
-    input:
-    tuple val(meta), path(fasta_input_file)
-
-    output:
-    tuple val(meta), path("*_trim_Ns"), emit: trailing_n_report
-    path "versions.yml"           , emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    trim_Ns.py ${fasta_input_file} ${prefix}_trim_Ns
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        python: \$(python --version | sed 's/Python //g')
-        trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2)
-    END_VERSIONS
-    """
-
-    stub:
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    touch ${prefix}_trim_Ns
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        python: \$(python --version | sed 's/Python //g')
-        trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2)
-    END_VERSIONS
-    """
-}
\ No newline at end of file

From f39e4c52e02bace81b97a2bcf19439c46572d9b8 Mon Sep 17 00:00:00 2001
From: eeaunin <eeaunin@gmail.com>
Date: Thu, 16 Nov 2023 20:57:49 +0000
Subject: [PATCH 08/23] Trying a change in github_testing/test.yaml

---
 assets/github_testing/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml
index bb1c09f..57e9d5c 100755
--- a/assets/github_testing/test.yaml
+++ b/assets/github_testing/test.yaml
@@ -20,7 +20,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
 fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
 diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
 diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
-vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
+vecscreen_database_path: /home/runner/work/ascc/ascc/kraken2/kraken2
 seqkit:
   sliding: 6000
   window: 100000

From d5d7838862b661db6e2b8e6fbbeabc9d5490e194 Mon Sep 17 00:00:00 2001
From: eeaunin <eeaunin@gmail.com>
Date: Thu, 16 Nov 2023 21:13:56 +0000
Subject: [PATCH 09/23] Trying a change in github_testing/test.yaml

---
 assets/github_testing/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml
index 57e9d5c..e4e7b82 100755
--- a/assets/github_testing/test.yaml
+++ b/assets/github_testing/test.yaml
@@ -20,7 +20,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
 fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
 diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
 diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
-vecscreen_database_path: /home/runner/work/ascc/ascc/kraken2/kraken2
+vecscreen_database_path: /home/runner/work/ascc/ascc/NT_database
 seqkit:
   sliding: 6000
   window: 100000

From 234803aa7093a73db4eb2a0be4aa401c6650ca90 Mon Sep 17 00:00:00 2001
From: yumisims <yy5@sanger.ac.uk>
Date: Fri, 17 Nov 2023 14:27:29 +0000
Subject: [PATCH 10/23] add vecscreen db

---
 .github/workflows/ci.yml        | 2 +-
 assets/github_testing/test.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 83d2195..b7c0b09 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -82,7 +82,7 @@ jobs:
       - name: Download the vecscreen test data
         run: |
           mkdir vecscreen
-          wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna -O vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
+          curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/v4/16SMicrobial_v4.tar.gz | tar -C vecscreen -xzf -
 
       - name: Run pipeline with test data
         # TODO nf-core: You can customise CI pipeline run tests as required
diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml
index e4e7b82..7b35de1 100755
--- a/assets/github_testing/test.yaml
+++ b/assets/github_testing/test.yaml
@@ -20,7 +20,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
 fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
 diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
 diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
-vecscreen_database_path: /home/runner/work/ascc/ascc/NT_database
+vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen
 seqkit:
   sliding: 6000
   window: 100000

From 1f628989e76150b3a7774add30ca64cfc0e7ef08 Mon Sep 17 00:00:00 2001
From: yumisims <yy5@sanger.ac.uk>
Date: Fri, 17 Nov 2023 15:07:15 +0000
Subject: [PATCH 11/23] putting in blastdb tuple generation

---
 subworkflows/local/run_vecscreen.nf | 20 ++++++++++++++++++--
 workflows/ascc.nf                   |  3 +--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/subworkflows/local/run_vecscreen.nf b/subworkflows/local/run_vecscreen.nf
index 645fe82..2bcf027 100644
--- a/subworkflows/local/run_vecscreen.nf
+++ b/subworkflows/local/run_vecscreen.nf
@@ -10,19 +10,35 @@ include { SUMMARISE_VECSCREEN_OUTPUT }      from '../../modules/local/summarise_
 workflow RUN_VECSCREEN {
     take:
     reference_tuple         // val(meta), path(fasta)
-    adapters_blastn_database_directory
+    vecscreen_database_path
 
     main:
     ch_versions = Channel.empty()
     // MODULE: CHUNKS THE ASSEMBLY INTO PIECES WITH FIXED LENGTH
     CHUNK_ASSEMBLY_FOR_VECSCREEN(reference_tuple)
     ch_versions = ch_versions.mix(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.versions)
+
+    reference_tuple
+        .combine( vecscreen_database_path )
+        .map { meta, ref, vecscreen_database_path  ->
+            tuple(
+                [   id          : meta.id
+                ],
+                vecscreen_database_path
+            )
+        }
+        .set { vecscreen_database }
+
+
+
     // MODULE: RUNS NCBI VECSCREEN
-    NCBITOOLS_VECSCREEN(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, [ [id:'vecscreen_adapters_database'], adapters_blastn_database_directory ])
+    NCBITOOLS_VECSCREEN(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, vecscreen_database)
     ch_versions = ch_versions.mix(NCBITOOLS_VECSCREEN.out.versions)
+
     // MODULE: REFORMATS VECSCREEN OUTPUT AND FILTERS IT TO REMOVE NO-HIT RESULTS AND KEEP ONLY HITS
     FILTER_VECSCREEN_RESULTS(NCBITOOLS_VECSCREEN.out.vecscreen_output)
     ch_versions = ch_versions.mix(FILTER_VECSCREEN_RESULTS.out.versions)
+
     // MODULE: CONVERTS COORDINATES IN ASSEMBLY CHUNKS BACK TO COORDINATES IN THE WHOLE ASSEMBLY AND WRITES A REPORT FILE
     SUMMARISE_VECSCREEN_OUTPUT(FILTER_VECSCREEN_RESULTS.out.filtered_vecscreen_outfile)
     ch_versions = ch_versions.mix(SUMMARISE_VECSCREEN_OUTPUT.out.versions)
diff --git a/workflows/ascc.nf b/workflows/ascc.nf
index 1f596ed..aca9975 100644
--- a/workflows/ascc.nf
+++ b/workflows/ascc.nf
@@ -235,8 +235,7 @@ workflow ASCC {
 
     RUN_VECSCREEN (
         GENERATE_GENOME.out.reference_tuple,
-        "/lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database"
-        //YAML_INPUT.out.vecscreen_database_path
+        YAML_INPUT.out.vecscreen_database_path
     )
     ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions)
 

From a1cd1ffbb02e5d9a41c691dd32fedb0490008cc8 Mon Sep 17 00:00:00 2001
From: yumisims <yy5@sanger.ac.uk>
Date: Fri, 17 Nov 2023 15:18:26 +0000
Subject: [PATCH 12/23] refactor chunk fasta

---
 bin/chunk_assembly_for_vecscreen.py | 64 ++++++++++++++++-------------
 1 file changed, 35 insertions(+), 29 deletions(-)

diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py
index 321777c..576064e 100755
--- a/bin/chunk_assembly_for_vecscreen.py
+++ b/bin/chunk_assembly_for_vecscreen.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 """
-Script for chunking an assembly before running NCBI VecScreen. Adapted from a script by James Torrance, edited by Eerik Aunin
+Script for chunking an assembly before running NCBI VecScreen. Adapted from a script by James Torrance.
+The script was further refactored by Eerik Aunin and Yumi Sims
 """
 
 from Bio import SeqIO
@@ -8,6 +9,30 @@
 import os
 
 
+def process_record(record, chunk_num, threshold_length, overlap_length):
+    return (
+        record[
+            chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length)
+        ]
+        if chunk_num * threshold_length < len(record) - (threshold_length + overlap_length)
+        else record[chunk_num * threshold_length :]
+    )
+
+
+def generate_records_to_write(record, threshold_length, overlap_length):
+    return [
+        (
+            record_slice.id + f".chunk_{chunk_num + 1}",
+            "",
+            record_slice,
+        )
+        for chunk_num, record_slice in enumerate(
+            process_record(record, i, threshold_length, overlap_length)
+            for i in range((len(record) - 1) // threshold_length + 1)
+        )
+    ]
+
+
 def main(fasta_input_file, fasta_output_file):
     fasta_input_file = os.path.abspath(fasta_input_file)
     fasta_output_file = os.path.abspath(fasta_output_file)
@@ -16,35 +41,15 @@ def main(fasta_input_file, fasta_output_file):
     overlap_length = int(threshold_length / 10)
     minimum_record_size = 11
 
-    fasta_output_handle = open(fasta_output_file, "w")
-
-    with open(fasta_input_file, "r") as fasta_input_handle:
-        for record in SeqIO.parse(fasta_input_handle, "fasta"):
-            if len(record) >= minimum_record_size:
-                records_to_write = []
-
-                slice_count = 0
-                while (slice_count * threshold_length) < len(record) - (threshold_length + overlap_length):
-                    record_slice = record[
-                        (slice_count * threshold_length) : ((slice_count + 1) * threshold_length + overlap_length)
-                    ]
-                    record_slice.id += ".chunk_" + str(slice_count + 1)
+    try:
+        with open(fasta_output_file, "w") as fasta_output_handle:
+            for record in SeqIO.parse(fasta_input_file, "fasta"):
+                if len(record) >= minimum_record_size:
+                    records_to_write = generate_records_to_write(record, threshold_length, overlap_length)
+                    SeqIO.write(records_to_write, fasta_output_handle, "fasta")
 
-                    record_slice.description = ""
-                    records_to_write.append(record_slice)
-                    slice_count += 1
-
-                final_record_slice = record[(slice_count * threshold_length) :]
-
-                if slice_count > 0:
-                    final_record_slice.id += ".chunk_" + str(slice_count + 1)
-                final_record_slice.description = ""
-
-                records_to_write.append(final_record_slice)
-
-                SeqIO.write(records_to_write, fasta_output_handle, "fasta")
-
-    fasta_output_handle.close()
+    except Exception as e:
+        print(f"An error occurred: {e}")
 
 
 if __name__ == "__main__":
@@ -54,3 +59,4 @@ def main(fasta_input_file, fasta_output_file):
     parser.add_argument("-v", "--version", action="version", version="1.0")
     args = parser.parse_args()
     main(args.fasta_input_file, args.fasta_output_file)
+

From 6cc19bc9f79712b85f2a2e0cd64e488174d5662e Mon Sep 17 00:00:00 2001
From: yumisims <yy5@sanger.ac.uk>
Date: Fri, 17 Nov 2023 15:21:02 +0000
Subject: [PATCH 13/23] black

---
 bin/chunk_assembly_for_vecscreen.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py
index 576064e..7f08e54 100755
--- a/bin/chunk_assembly_for_vecscreen.py
+++ b/bin/chunk_assembly_for_vecscreen.py
@@ -11,9 +11,7 @@
 
 def process_record(record, chunk_num, threshold_length, overlap_length):
     return (
-        record[
-            chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length)
-        ]
+        record[chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length)]
         if chunk_num * threshold_length < len(record) - (threshold_length + overlap_length)
         else record[chunk_num * threshold_length :]
     )
@@ -59,4 +57,3 @@ def main(fasta_input_file, fasta_output_file):
     parser.add_argument("-v", "--version", action="version", version="1.0")
     args = parser.parse_args()
     main(args.fasta_input_file, args.fasta_output_file)
-

From dea8d118fdd808d369a8339346c8fc2dcd81bcff Mon Sep 17 00:00:00 2001
From: yumisims <yy5@sanger.ac.uk>
Date: Fri, 17 Nov 2023 15:25:58 +0000
Subject: [PATCH 14/23] black

---
 bin/chunk_assembly_for_vecscreen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py
index 7f08e54..38bce8d 100755
--- a/bin/chunk_assembly_for_vecscreen.py
+++ b/bin/chunk_assembly_for_vecscreen.py
@@ -20,7 +20,7 @@ def process_record(record, chunk_num, threshold_length, overlap_length):
 def generate_records_to_write(record, threshold_length, overlap_length):
     return [
         (
-            record_slice.id + f".chunk_{chunk_num + 1}",
+            record_slice.id + ".chunk_{}".format(chunk_num + 1),
             "",
             record_slice,
         )

From 8bcae07fcdfa5325d1134d1dfed3f1d78aeaf0fa Mon Sep 17 00:00:00 2001
From: yumisims <yy5@sanger.ac.uk>
Date: Fri, 17 Nov 2023 15:35:22 +0000
Subject: [PATCH 15/23] the default python is 3.5 so change the beloved f
 striiiiing

---
 bin/chunk_assembly_for_vecscreen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py
index 38bce8d..e36d1ed 100755
--- a/bin/chunk_assembly_for_vecscreen.py
+++ b/bin/chunk_assembly_for_vecscreen.py
@@ -47,7 +47,7 @@ def main(fasta_input_file, fasta_output_file):
                     SeqIO.write(records_to_write, fasta_output_handle, "fasta")
 
     except Exception as e:
-        print(f"An error occurred: {e}")
+        print("An error occurred: {}".format(e))
 
 
 if __name__ == "__main__":

From 77a98a7433ce8c32e465fa737d7b8c6e5cb4e70a Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Fri, 17 Nov 2023 16:33:00 +0000
Subject: [PATCH 16/23] Formatting and minor changes to channels

---
 conf/modules.config                           |  3 +
 modules/local/chunk_assembly_for_vecscreen.nf | 10 ++--
 modules/local/filter_vecscreen_results.nf     |  7 ++-
 modules/local/summarise_vecscreen_output.nf   |  2 +
 subworkflows/local/run_vecscreen.nf           | 59 +++++++++----------
 subworkflows/local/yaml_input.nf              | 11 +++-
 workflows/ascc.nf                             | 51 ++++++----------
 7 files changed, 73 insertions(+), 70 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 2febc01..0820f4a 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -99,4 +99,7 @@ process {
         ext.args = { "-f3" }
     }
 
+    withName: FILTER_VECSCREEN_RESULTS {
+        ext.args    =   "--skip_reporting_suspect_hits --skip_reporting_weak_hits --skip_reporting_no_hits"
+    }
 }
diff --git a/modules/local/chunk_assembly_for_vecscreen.nf b/modules/local/chunk_assembly_for_vecscreen.nf
index 727e04b..9969e42 100644
--- a/modules/local/chunk_assembly_for_vecscreen.nf
+++ b/modules/local/chunk_assembly_for_vecscreen.nf
@@ -11,15 +11,16 @@ process CHUNK_ASSEMBLY_FOR_VECSCREEN {
     tuple val(meta), path(fasta_input_file)
 
     output:
-    tuple val(meta), path("chunked_assembly.fa"), emit: chunked_assembly
-    path "versions.yml"           , emit: versions
+    tuple val(meta), path("*_chunked_assembly.fa")  , emit: chunked_assembly
+    path "versions.yml"                             , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
+    def prefix  = args.ext.prefix   ?: "${meta.id}"
     """
-    chunk_assembly_for_vecscreen.py $fasta_input_file chunked_assembly.fa
+    chunk_assembly_for_vecscreen.py $fasta_input_file ${prefix}_chunked_assembly.fa
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
@@ -29,8 +30,9 @@ process CHUNK_ASSEMBLY_FOR_VECSCREEN {
     """
 
     stub:
+    def prefix  = args.ext.prefix   ?: "${meta.id}"
     """
-    touch chunked_assembly.fa
+    touch ${prefix}_chunked_assembly.fa
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/filter_vecscreen_results.nf b/modules/local/filter_vecscreen_results.nf
index 38c701e..9cb1b3b 100644
--- a/modules/local/filter_vecscreen_results.nf
+++ b/modules/local/filter_vecscreen_results.nf
@@ -18,8 +18,10 @@ process FILTER_VECSCREEN_RESULTS {
     task.ext.when == null || task.ext.when
 
     script:
+    def prefix      = args.ext.prefix ?: "${meta.id}"
+    def args        = args.ext.args ?: ''
     """
-    VSlistTo1HitPerLine.py --skip_reporting_suspect_hits --skip_reporting_weak_hits --skip_reporting_no_hits ${vecscreen_outfile} > vecscreen.grepped.out
+    VSlistTo1HitPerLine.py ${args} ${vecscreen_outfile} > ${prefix}_vecscreen.grepped.out
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
@@ -29,8 +31,9 @@ process FILTER_VECSCREEN_RESULTS {
     """
 
     stub:
+    def prefix      = args.ext.prefix ?: "${meta.id}"
     """
-    touch vecscreen.grepped.out
+    touch ${prefix}_vecscreen.grepped.out
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/summarise_vecscreen_output.nf b/modules/local/summarise_vecscreen_output.nf
index 3154c3e..8268f3c 100644
--- a/modules/local/summarise_vecscreen_output.nf
+++ b/modules/local/summarise_vecscreen_output.nf
@@ -21,6 +21,7 @@ process SUMMARISE_VECSCREEN_OUTPUT {
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
     summarise_vecscreen_output.py ${vecscreen_filtered_outfile} > ${prefix}.vecscreen_contamination
+
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         python: \$(python --version | sed 's/Python //g')
@@ -32,6 +33,7 @@ process SUMMARISE_VECSCREEN_OUTPUT {
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
     touch ${prefix}.vecscreen_contamination
+
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         python: \$(python --version | sed 's/Python //g')
diff --git a/subworkflows/local/run_vecscreen.nf b/subworkflows/local/run_vecscreen.nf
index 2bcf027..27a38b1 100644
--- a/subworkflows/local/run_vecscreen.nf
+++ b/subworkflows/local/run_vecscreen.nf
@@ -1,50 +1,49 @@
 #!/usr/bin/env nextflow
 // MODULE IMPORT BLOCK
 
-include { CHUNK_ASSEMBLY_FOR_VECSCREEN }    from '../../modules/local/chunk_assembly_for_vecscreen'
-include { NCBITOOLS_VECSCREEN }             from '../../modules/nf-core/ncbitools/vecscreen/main'
-include { FILTER_VECSCREEN_RESULTS }        from '../../modules/local/filter_vecscreen_results'
-include { SUMMARISE_VECSCREEN_OUTPUT }      from '../../modules/local/summarise_vecscreen_output'
+include { CHUNK_ASSEMBLY_FOR_VECSCREEN  }   from '../../modules/local/chunk_assembly_for_vecscreen'
+include { NCBITOOLS_VECSCREEN           }   from '../../modules/nf-core/ncbitools/vecscreen/main'
+include { FILTER_VECSCREEN_RESULTS      }   from '../../modules/local/filter_vecscreen_results'
+include { SUMMARISE_VECSCREEN_OUTPUT    }   from '../../modules/local/summarise_vecscreen_output'
 
 
 workflow RUN_VECSCREEN {
     take:
-    reference_tuple         // val(meta), path(fasta)
-    vecscreen_database_path
+    reference_tuple             // val(meta), path(fasta)
+    vecscreen_database_tuple    // val(meta), val(db_path)
 
     main:
-    ch_versions = Channel.empty()
-    // MODULE: CHUNKS THE ASSEMBLY INTO PIECES WITH FIXED LENGTH
-    CHUNK_ASSEMBLY_FOR_VECSCREEN(reference_tuple)
-    ch_versions = ch_versions.mix(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.versions)
-
-    reference_tuple
-        .combine( vecscreen_database_path )
-        .map { meta, ref, vecscreen_database_path  ->
-            tuple(
-                [   id          : meta.id
-                ],
-                vecscreen_database_path
-            )
-        }
-        .set { vecscreen_database }
-
+    ch_versions                 = Channel.empty()
 
+    //
+    // MODULE: CHUNKS THE ASSEMBLY INTO PIECES WITH FIXED LENGTH
+    //
+    CHUNK_ASSEMBLY_FOR_VECSCREEN( reference_tuple )
+    ch_versions                 = ch_versions.mix( CHUNK_ASSEMBLY_FOR_VECSCREEN.out.versions )
 
+    //
     // MODULE: RUNS NCBI VECSCREEN
-    NCBITOOLS_VECSCREEN(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, vecscreen_database)
-    ch_versions = ch_versions.mix(NCBITOOLS_VECSCREEN.out.versions)
-
+    //
+    NCBITOOLS_VECSCREEN(
+        CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly,
+        vecscreen_database_tuple
+    )
+    ch_versions                 = ch_versions.mix( NCBITOOLS_VECSCREEN.out.versions )
+
+    //
     // MODULE: REFORMATS VECSCREEN OUTPUT AND FILTERS IT TO REMOVE NO-HIT RESULTS AND KEEP ONLY HITS
-    FILTER_VECSCREEN_RESULTS(NCBITOOLS_VECSCREEN.out.vecscreen_output)
-    ch_versions = ch_versions.mix(FILTER_VECSCREEN_RESULTS.out.versions)
+    //
+    FILTER_VECSCREEN_RESULTS( NCBITOOLS_VECSCREEN.out.vecscreen_output )
+    ch_versions                 = ch_versions.mix( FILTER_VECSCREEN_RESULTS.out.versions )
 
+    //
     // MODULE: CONVERTS COORDINATES IN ASSEMBLY CHUNKS BACK TO COORDINATES IN THE WHOLE ASSEMBLY AND WRITES A REPORT FILE
-    SUMMARISE_VECSCREEN_OUTPUT(FILTER_VECSCREEN_RESULTS.out.filtered_vecscreen_outfile)
-    ch_versions = ch_versions.mix(SUMMARISE_VECSCREEN_OUTPUT.out.versions)
+    //
+    SUMMARISE_VECSCREEN_OUTPUT( FILTER_VECSCREEN_RESULTS.out.filtered_vecscreen_outfile )
+    ch_versions                 = ch_versions.mix( SUMMARISE_VECSCREEN_OUTPUT.out.versions )
 
     emit:
     vecscreen_contamination     = SUMMARISE_VECSCREEN_OUTPUT.out.vecscreen_contamination
-    versions                    = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
+    versions                    = ch_versions.ifEmpty( null ) // channel: [ versions.yml ]
 }
 
diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf
index 3ac41ec..964f29d 100644
--- a/subworkflows/local/yaml_input.nf
+++ b/subworkflows/local/yaml_input.nf
@@ -97,6 +97,15 @@ workflow YAML_INPUT {
         }
         .set{ ch_plastid }
 
+    group.assembly_title
+        .combine ( group.vecscreen_database_path )
+        .map{ assembly_id, db_path ->
+            tuple(  [    id: assembly_id ],
+                    db_path
+            )
+        }
+        .set{ ch_vecscreen }
+
     emit:
     reference_tuple                  = ch_reference
     pacbio_tuple                     = ch_pacbio
@@ -115,7 +124,7 @@ workflow YAML_INPUT {
     fcs_gx_database_path             = group.fcs_gx_database_path
     diamond_uniprot_database_path    = group.diamond_uniprot_database_path
     diamond_nr_database_path         = group.diamond_nr_database_path
-    vecscreen_database_path          = group.vecscreen_database_path
+    vecscreen_database_path          = ch_vecscreen
     seqkit_sliding                   = seqkit.sliding_value
     seqkit_window                    = seqkit.window_value
     mito_tuple                       = ch_mito
diff --git a/workflows/ascc.nf b/workflows/ascc.nf
index aca9975..c61f2b7 100644
--- a/workflows/ascc.nf
+++ b/workflows/ascc.nf
@@ -24,22 +24,19 @@ WorkflowAscc.initialise(params, log)
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-
-include { YAML_INPUT                    } from '../subworkflows/local/yaml_input'
-include { GENERATE_GENOME               } from '../subworkflows/local/generate_genome'
-include { EXTRACT_TIARA_HITS            } from '../subworkflows/local/extract_tiara_hits'
-include { EXTRACT_NT_BLAST              } from '../subworkflows/local/extract_nt_blast'
-include { RUN_FCSADAPTOR                } from '../subworkflows/local/run_fcsadaptor'
-include { RUN_NT_KRAKEN                 } from '../subworkflows/local/run_nt_kraken'
-include { RUN_FCSGX                     } from '../subworkflows/local/run_fcsgx'
-include { PACBIO_BARCODE_CHECK          } from '../subworkflows/local/pacbio_barcode_check'
-include { RUN_READ_COVERAGE             } from '../subworkflows/local/run_read_coverage'
-include { RUN_VECSCREEN                 } from '../subworkflows/local/run_vecscreen'
+include { YAML_INPUT                                    } from '../subworkflows/local/yaml_input'
+include { GENERATE_GENOME                               } from '../subworkflows/local/generate_genome'
+include { EXTRACT_TIARA_HITS                            } from '../subworkflows/local/extract_tiara_hits'
+include { EXTRACT_NT_BLAST                              } from '../subworkflows/local/extract_nt_blast'
+include { RUN_FCSADAPTOR                                } from '../subworkflows/local/run_fcsadaptor'
+include { RUN_NT_KRAKEN                                 } from '../subworkflows/local/run_nt_kraken'
+include { RUN_FCSGX                                     } from '../subworkflows/local/run_fcsgx'
+include { PACBIO_BARCODE_CHECK                          } from '../subworkflows/local/pacbio_barcode_check'
+include { RUN_READ_COVERAGE                             } from '../subworkflows/local/run_read_coverage'
+include { RUN_VECSCREEN                                 } from '../subworkflows/local/run_vecscreen'
 include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST  } from '../subworkflows/local/organellar_blast'
 include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST     } from '../subworkflows/local/organellar_blast'
 
-
-
 //
 // MODULE: Local modules
 //
@@ -125,77 +122,66 @@ workflow ASCC {
     //
     // SUBWORKFLOW: EXTRACT RESULTS HITS FROM NT-BLAST
     //
-/*     EXTRACT_NT_BLAST (
+    EXTRACT_NT_BLAST (
         modified_input,
         YAML_INPUT.out.nt_database,
         YAML_INPUT.out.ncbi_accessions,
         YAML_INPUT.out.ncbi_rankedlineage_path
     )
-    ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) */
+    ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions)
 
     //
     // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH
     //
-
-    /*
     YAML_INPUT.out.mito_tuple
         .branch { meta, check ->
             valid:      check != "NO MITO"
             invalid:    check == "NO MITO"
         }
         .set { mito_check }
-    */
+
 
     //
     // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME
     //
-    /*
     MITO_ORGANELLAR_BLAST (
         YAML_INPUT.out.reference_tuple,
         YAML_INPUT.out.mito_var,
         mito_check.valid
     )
     ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions)
-    */
 
     //
     // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH
     //
-    /*
     YAML_INPUT.out.plastid_tuple
         .branch { meta, check ->
             valid:      check != "NO PLASTID"
             invalid:    check == "NO PLASTID"
         }
         .set { plastid_check }
-    */
 
     //
     // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME
     //
-    /*
     PLASTID_ORGANELLAR_BLAST (
         YAML_INPUT.out.reference_tuple,
         YAML_INPUT.out.plastid_var,
         plastid_check.valid
     )
     ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions)
-    */
 
     //
     // SUBWORKFLOW:
     //
-    /*
     RUN_FCSADAPTOR (
         YAML_INPUT.out.reference_tuple
     )
     ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions)
-    */
 
     //
     // SUBWORKFLOW:
     //
-    /*
     RUN_FCSGX (
         YAML_INPUT.out.reference_tuple,
         YAML_INPUT.out.fcs_gx_database_path,
@@ -203,23 +189,21 @@ workflow ASCC {
         YAML_INPUT.out.ncbi_rankedlineage_path
     )
     ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions)
-    */
 
     //
     // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA
     //
-    /*PACBIO_BARCODE_CHECK (
+    PACBIO_BARCODE_CHECK (
         YAML_INPUT.out.reference_tuple,
         YAML_INPUT.out.pacbio_tuple,
         YAML_INPUT.out.pacbio_barcodes,
         YAML_INPUT.out.pacbio_multiplex_codes
     )
-    ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions)*/
+    ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions)
 
     //
     // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE
     //
-    /*
     RUN_READ_COVERAGE (
         YAML_INPUT.out.reference_tuple,
         YAML_INPUT.out.assembly_path,
@@ -227,18 +211,19 @@ workflow ASCC {
         YAML_INPUT.out.reads_type
     )
     ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions)
-    */
 
     //
     // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS
     //
-
     RUN_VECSCREEN (
         GENERATE_GENOME.out.reference_tuple,
         YAML_INPUT.out.vecscreen_database_path
     )
     ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions)
 
+    //
+    // SUBWORKFLOW: Collates version data from prior subworflows
+    //
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )

From 7317fd7c9854db1a9e4b12c023b29a703e2ff55c Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Fri, 17 Nov 2023 16:38:04 +0000
Subject: [PATCH 17/23] removed remnant file

---
 workflows/ascc_full.nf | 246 -----------------------------------------
 1 file changed, 246 deletions(-)
 delete mode 100644 workflows/ascc_full.nf

diff --git a/workflows/ascc_full.nf b/workflows/ascc_full.nf
deleted file mode 100644
index 3107122..0000000
--- a/workflows/ascc_full.nf
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    PRINT PARAMS SUMMARY
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation'
-
-def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs)
-def citation = '\n' + WorkflowMain.citation(workflow) + '\n'
-def summary_params = paramsSummaryMap(workflow)
-
-// Print parameter summary log to screen
-log.info logo + paramsSummaryLog(workflow) + citation
-
-WorkflowAscc.initialise(params, log)
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    IMPORT LOCAL MODULES/SUBWORKFLOWS
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-//
-// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
-//
-
-include { YAML_INPUT                    } from '../subworkflows/local/yaml_input'
-include { GENERATE_GENOME               } from '../subworkflows/local/generate_genome'
-include { EXTRACT_TIARA_HITS            } from '../subworkflows/local/extract_tiara_hits'
-include { EXTRACT_NT_BLAST              } from '../subworkflows/local/extract_nt_blast'
-include { RUN_FCSADAPTOR                } from '../subworkflows/local/run_fcsadaptor'
-include { RUN_NT_KRAKEN                 } from '../subworkflows/local/run_nt_kraken'
-include { RUN_FCSGX                     } from '../subworkflows/local/run_fcsgx'
-include { PACBIO_BARCODE_CHECK          } from '../subworkflows/local/pacbio_barcode_check'
-include { RUN_READ_COVERAGE             } from '../subworkflows/local/run_read_coverage'
-include { RUN_VECSCREEN                 } from '../subworkflows/local/run_vecscreen'
-include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST  } from '../subworkflows/local/organellar_blast'
-include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST     } from '../subworkflows/local/organellar_blast'
-
-
-
-//
-// MODULE: Local modules
-//
-include { GC_CONTENT                                    } from '../modules/local/gc_content'
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    IMPORT NF-CORE MODULES/SUBWORKFLOWS
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-//
-// MODULE: Installed directly from nf-core/modules
-//
-include { CUSTOM_DUMPSOFTWAREVERSIONS                   } from '../modules/nf-core/custom/dumpsoftwareversions/main'
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    RUN MAIN WORKFLOW
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-workflow ASCC {
-
-    main:
-    ch_versions = Channel.empty()
-
-    input_ch = Channel.fromPath(params.input, checkIfExists: true)
-
-    //
-    // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE
-    //
-    YAML_INPUT (
-        input_ch
-    )
-    ch_versions = ch_versions.mix(YAML_INPUT.out.versions)
-
-    //
-    // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA
-    //
-    GC_CONTENT (
-        YAML_INPUT.out.reference_tuple
-    )
-    ch_versions = ch_versions.mix(GC_CONTENT.out.versions)
-
-    //
-    // SUBWORKFLOW: GENERATE GENOME FILE
-    //
-    GENERATE_GENOME (
-        YAML_INPUT.out.reference_tuple,
-        YAML_INPUT.out.pacbio_barcodes
-    )
-    ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions)
-
-    //
-    // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA
-    //
-    EXTRACT_TIARA_HITS (
-        GENERATE_GENOME.out.reference_tuple
-    )
-    ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions)
-
-    //
-    // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE
-    //
-    YAML_INPUT.out.reference_tuple
-        .combine ( YAML_INPUT.out.seqkit_sliding.toInteger() )
-        .combine ( YAML_INPUT.out.seqkit_window.toInteger() )
-        .map { meta, ref, sliding, window ->
-            tuple([ id      : meta.id,
-                    sliding : sliding,
-                    window  : window
-                ],
-                file(ref)
-            )}
-        .set { modified_input }
-
-    //
-    // SUBWORKFLOW: EXTRACT RESULTS HITS FROM NT-BLAST
-    //
-/*     EXTRACT_NT_BLAST (
-        modified_input,
-        YAML_INPUT.out.nt_database,
-        YAML_INPUT.out.ncbi_accessions,
-        YAML_INPUT.out.ncbi_rankedlineage_path
-    )
-    ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) */
-
-    //
-    // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH
-    //
-    YAML_INPUT.out.mito_tuple
-        .branch { meta, check ->
-            valid:      check != "NO MITO"
-            invalid:    check == "NO MITO"
-        }
-        .set { mito_check }
-
-    //
-    // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME
-    //
-    MITO_ORGANELLAR_BLAST (
-        YAML_INPUT.out.reference_tuple,
-        YAML_INPUT.out.mito_var,
-        mito_check.valid
-    )
-    ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions)
-
-    //
-    // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH
-    //
-    YAML_INPUT.out.plastid_tuple
-        .branch { meta, check ->
-            valid:      check != "NO PLASTID"
-            invalid:    check == "NO PLASTID"
-        }
-        .set { plastid_check }
-
-    //
-    // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME
-    //
-    PLASTID_ORGANELLAR_BLAST (
-        YAML_INPUT.out.reference_tuple,
-        YAML_INPUT.out.plastid_var,
-        plastid_check.valid
-    )
-    ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions)
-
-    //
-    // SUBWORKFLOW:
-    //
-    RUN_FCSADAPTOR (
-        YAML_INPUT.out.reference_tuple
-    )
-    ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions)
-
-    //
-    // SUBWORKFLOW:
-    //
-    RUN_FCSGX (
-        YAML_INPUT.out.reference_tuple,
-        YAML_INPUT.out.fcs_gx_database_path,
-        YAML_INPUT.out.taxid,
-        YAML_INPUT.out.ncbi_rankedlineage_path
-    )
-    ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions)
-
-    //
-    // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA
-    //
-    /*PACBIO_BARCODE_CHECK (
-        YAML_INPUT.out.reference_tuple,
-        YAML_INPUT.out.pacbio_tuple,
-        YAML_INPUT.out.pacbio_barcodes,
-        YAML_INPUT.out.pacbio_multiplex_codes
-    )
-    ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions)*/
-
-    //
-    // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE
-    //
-    RUN_READ_COVERAGE (
-        YAML_INPUT.out.reference_tuple,
-        YAML_INPUT.out.assembly_path,
-        YAML_INPUT.out.pacbio_tuple,
-        YAML_INPUT.out.reads_type
-    )
-    ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions)
-
-    //
-    // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS
-    //
-    CUSTOM_DUMPSOFTWAREVERSIONS (
-        ch_versions.unique().collectFile(name: 'collated_versions.yml')
-    )
-
-    emit:
-    software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml
-    versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions
-}
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    COMPLETION EMAIL AND SUMMARY
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-workflow.onComplete {
-    if (params.email || params.email_on_fail) {
-        NfcoreTemplate.email(workflow, params, summary_params, projectDir, log)
-    }
-    NfcoreTemplate.summary(workflow, params, log)
-    if (params.hook_url) {
-        NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log)
-    }
-    // TreeValProject.summary(workflow, reference_tuple, summary_params, projectDir)
-
-}
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    THE END
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/

From 005c13e11444c225359e7f9ccb1955a6db91e674 Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Fri, 17 Nov 2023 16:59:42 +0000
Subject: [PATCH 18/23] Missed a couple of comments

---
 workflows/ascc.nf | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/workflows/ascc.nf b/workflows/ascc.nf
index c61f2b7..014b34a 100644
--- a/workflows/ascc.nf
+++ b/workflows/ascc.nf
@@ -77,12 +77,10 @@ workflow ASCC {
     //
     // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA
     //
-    /*
     GC_CONTENT (
         YAML_INPUT.out.reference_tuple
     )
     ch_versions = ch_versions.mix(GC_CONTENT.out.versions)
-    */
 
     //
     // SUBWORKFLOW: GENERATE GENOME FILE
@@ -96,13 +94,10 @@ workflow ASCC {
     //
     // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA
     //
-
-    /*
     EXTRACT_TIARA_HITS (
         GENERATE_GENOME.out.reference_tuple
     )
     ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions)
-    */
 
     //
     // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE

From 4669b12d8b80913e4b2406734d24a7fe470c174d Mon Sep 17 00:00:00 2001
From: yumisims <yy5@sanger.ac.uk>
Date: Fri, 17 Nov 2023 22:36:49 +0000
Subject: [PATCH 19/23] add wildcard to vecscreen output

---
 bin/parse_fcsgx_result.py                 | 79 +++++++++++------------
 modules/local/filter_vecscreen_results.nf |  2 +-
 2 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/bin/parse_fcsgx_result.py b/bin/parse_fcsgx_result.py
index 5f3f8bc..bc15372 100755
--- a/bin/parse_fcsgx_result.py
+++ b/bin/parse_fcsgx_result.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 Script for parsing the result files of FCS-GX, originally written by Eerik Aunin (ea10)
-further minor modifications by Yumi Sims (yy5)
+further refactoring/modifications by Yumi Sims (yy5)
 """
 
 import general_purpose_functions as gpf
@@ -42,51 +42,50 @@ def load_taxids_data(taxonomy_file):
     """
     Parses the *.taxonomy.rpt to find taxids that correspond to species names. Returns this as a dictionary
     """
-    taxonomy_data = gpf.l(taxonomy_file)
-    assert len(taxonomy_data) > 2
-    taxonomy_data = taxonomy_data[2 : len(taxonomy_data)]
+    taxonomy_data = gpf.l(taxonomy_file)[2:]
+    assert taxonomy_data, "Taxonomy data is empty"
+
     collection_dict = OrderedDict()
 
     for line in taxonomy_data:
-        add_new_entry = False
-        multiple_divs_per_scaff = False
         split_line = line.split("\t")
         assert len(split_line) == 34
-        scaff = split_line[0]
-        if "~" in scaff:
-            scaff = scaff.split("~")[0]
-        tax_name_1 = split_line[5]
-        tax_id_1 = split_line[6]
-        div_1 = split_line[7]
-        cvg_by_div_1 = split_line[8]
-        if cvg_by_div_1 != "":
-            cvg_by_div_1 = int(cvg_by_div_1)
-        cvg_by_tax_1 = split_line[9]
-        if cvg_by_tax_1 != "":
-            cvg_by_tax_1 = int(cvg_by_tax_1)
-        score_1 = split_line[10]
-        if score_1 != "":
-            score_1 = int(score_1)
-        else:
-            score_1 = 0
-        if scaff in collection_dict:
-            old_score = collection_dict[scaff]["fcs_gx_score"]
-            if old_score < score_1:
-                add_new_entry = True
-                multiple_divs_per_scaff = True
-        else:
-            add_new_entry = True
-        if add_new_entry is True:
-            row_dict = dict()
-            row_dict["fcs_gx_top_tax_name"] = tax_name_1
-            row_dict["fcs_gx_top_taxid"] = tax_id_1
-            row_dict["fcs_gx_div"] = div_1
-            row_dict["fcs_gx_coverage_by_div"] = cvg_by_div_1
-            row_dict["fcs_gx_coverage_by_tax"] = cvg_by_tax_1
-            row_dict["fcs_gx_score"] = score_1
-            row_dict["fcs_gx_multiple_divs_per_scaff"] = multiple_divs_per_scaff
-            row_dict["fcs_gx_action"] = "NA"
+
+        scaff = split_line[0].split("~")[0]
+        tax_name_1, tax_id_1, div_1, cvg_by_div_1, cvg_by_tax_1, score_1 = (
+            split_line[5],
+            split_line[6],
+            split_line[7],
+            int(split_line[8]) if split_line[8] else None,
+            int(split_line[9]) if split_line[9] else None,
+            int(split_line[10]) if split_line[10] else 0,
+        )
+
+        if scaff in collection_dict and collection_dict[scaff]["fcs_gx_score"] < score_1:
+            row_dict = {
+                "fcs_gx_top_tax_name": tax_name_1,
+                "fcs_gx_top_taxid": tax_id_1,
+                "fcs_gx_div": div_1,
+                "fcs_gx_coverage_by_div": cvg_by_div_1,
+                "fcs_gx_coverage_by_tax": cvg_by_tax_1,
+                "fcs_gx_score": score_1,
+                "fcs_gx_multiple_divs_per_scaff": True,
+                "fcs_gx_action": "NA",
+            }
             collection_dict[scaff] = row_dict
+        elif scaff not in collection_dict:
+            row_dict = {
+                "fcs_gx_top_tax_name": tax_name_1,
+                "fcs_gx_top_taxid": tax_id_1,
+                "fcs_gx_div": div_1,
+                "fcs_gx_coverage_by_div": cvg_by_div_1,
+                "fcs_gx_coverage_by_tax": cvg_by_tax_1,
+                "fcs_gx_score": score_1,
+                "fcs_gx_multiple_divs_per_scaff": False,
+                "fcs_gx_action": "NA",
+            }
+            collection_dict[scaff] = row_dict
+
     return collection_dict
 
 
diff --git a/modules/local/filter_vecscreen_results.nf b/modules/local/filter_vecscreen_results.nf
index 9cb1b3b..cf1863e 100644
--- a/modules/local/filter_vecscreen_results.nf
+++ b/modules/local/filter_vecscreen_results.nf
@@ -11,7 +11,7 @@ process FILTER_VECSCREEN_RESULTS {
     tuple val(meta), path(vecscreen_outfile)
 
     output:
-    tuple val(meta), path("vecscreen.grepped.out"), emit: filtered_vecscreen_outfile
+    tuple val(meta), path("*vecscreen.grepped.out"), emit: filtered_vecscreen_outfile
     path "versions.yml"                           , emit: versions
 
     when:

From 946cf0ed619730dc638cc0547e18a2f20c6f3287 Mon Sep 17 00:00:00 2001
From: yumisims <yy5@sanger.ac.uk>
Date: Fri, 17 Nov 2023 23:01:37 +0000
Subject: [PATCH 20/23] remove standard error output to pass github test

---
 bin/pacbio_barcode_check.py | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/bin/pacbio_barcode_check.py b/bin/pacbio_barcode_check.py
index 5103d5b..fe41264 100755
--- a/bin/pacbio_barcode_check.py
+++ b/bin/pacbio_barcode_check.py
@@ -13,6 +13,8 @@
 Originally written by Eerik Aunin @eeaunin
 
 Adapted by Damon-Lee Pointon @DLBPointon
+
+Refactored by Yumi Sims
 """
 
 from pathlib import Path
@@ -45,19 +47,28 @@ def detect_barcodes_from_read_file_names(barcodes_fasta_path, pacbio_read_files)
 
 def check_if_barcodes_exist_in_barcodes_fasta(barcodes_list, barcodes_fasta_path):
     """
-    Checks if the specified barcodes exist in the barcode sequences FASTA file, exits with an error message if a barcode is not found
+    Checks if the specified barcodes exist in the barcode sequences FASTA file, prints a message for each missing barcode.
     """
     barcodes_fasta_data = gpf.l(barcodes_fasta_path)
-    barcode_names_in_fasta = [n.split(">")[1] for n in barcodes_fasta_data if n.startswith(">")]
-    for barcode in barcodes_list:
-        if barcode not in barcode_names_in_fasta:
-            sys.stderr.write(
-                f"The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})\n"
-            )
-            sys.exit(1)
-
-    # If this print statement is reached, all user-supplied codes are present.
-    print("The query barcodes exist in the barcodes database file")
+    barcode_names_in_fasta = [n.split(">")[1].strip() for n in barcodes_fasta_data if n.startswith(">")]
+
+    missing_barcodes = [barcode for barcode in barcodes_list if barcode not in barcode_names_in_fasta]
+
+    print(
+        "\n".join(
+            [
+                f"Warning: The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})"
+                for barcode in missing_barcodes
+            ]
+        )
+    )
+
+    if missing_barcodes:
+        print(
+            f"\nSummary: Some barcodes were not found in the barcode sequences file.\nMissing barcodes: {', '.join(missing_barcodes)}"
+        )
+    else:
+        print("\nAll query barcodes exist in the barcode sequences file.")
 
 
 def main(barcodes_fasta_path, pacbio_read_files, pacbio_multiplexing_barcode_names):

From 05612ddc40a1509951fc3ae7441ea3749bdf21a3 Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Mon, 20 Nov 2023 13:05:57 +0000
Subject: [PATCH 21/23] Reverting changes to pacbio_barcode_check and adding
 barcodes found in test to test.yaml

---
 assets/test.yaml            |  2 +-
 bin/pacbio_barcode_check.py | 33 +++++++++++----------------------
 2 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/assets/test.yaml b/assets/test.yaml
index 2f3f9e1..e9eafa2 100755
--- a/assets/test.yaml
+++ b/assets/test.yaml
@@ -3,7 +3,7 @@ assembly_title: asccTinyTest
 reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
 reads_type: "hifi"
 pacbio_barcodes: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/pacbio_adaptors.fa
-pacbio_multiplexing_barcode_names: "bc1008,bc1009"
+pacbio_multiplexing_barcode_names: "bc2008,bc2009"
 sci_name: "Plasmodium yoelii yoelii 17XNL"
 taxid: 352914
 mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
diff --git a/bin/pacbio_barcode_check.py b/bin/pacbio_barcode_check.py
index fe41264..5103d5b 100755
--- a/bin/pacbio_barcode_check.py
+++ b/bin/pacbio_barcode_check.py
@@ -13,8 +13,6 @@
 Originally written by Eerik Aunin @eeaunin
 
 Adapted by Damon-Lee Pointon @DLBPointon
-
-Refactored by Yumi Sims
 """
 
 from pathlib import Path
@@ -47,28 +45,19 @@ def detect_barcodes_from_read_file_names(barcodes_fasta_path, pacbio_read_files)
 
 def check_if_barcodes_exist_in_barcodes_fasta(barcodes_list, barcodes_fasta_path):
     """
-    Checks if the specified barcodes exist in the barcode sequences FASTA file, prints a message for each missing barcode.
+    Checks if the specified barcodes exist in the barcode sequences FASTA file, exits with an error message if a barcode is not found
     """
     barcodes_fasta_data = gpf.l(barcodes_fasta_path)
-    barcode_names_in_fasta = [n.split(">")[1].strip() for n in barcodes_fasta_data if n.startswith(">")]
-
-    missing_barcodes = [barcode for barcode in barcodes_list if barcode not in barcode_names_in_fasta]
-
-    print(
-        "\n".join(
-            [
-                f"Warning: The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})"
-                for barcode in missing_barcodes
-            ]
-        )
-    )
-
-    if missing_barcodes:
-        print(
-            f"\nSummary: Some barcodes were not found in the barcode sequences file.\nMissing barcodes: {', '.join(missing_barcodes)}"
-        )
-    else:
-        print("\nAll query barcodes exist in the barcode sequences file.")
+    barcode_names_in_fasta = [n.split(">")[1] for n in barcodes_fasta_data if n.startswith(">")]
+    for barcode in barcodes_list:
+        if barcode not in barcode_names_in_fasta:
+            sys.stderr.write(
+                f"The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})\n"
+            )
+            sys.exit(1)
+
+    # If this print statement is reached, all user-supplied codes are present.
+    print("The query barcodes exist in the barcodes database file")
 
 
 def main(barcodes_fasta_path, pacbio_read_files, pacbio_multiplexing_barcode_names):

From 79a804a3c0ef8c8dc77df4200d61d22e2d36cbfc Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Mon, 20 Nov 2023 13:11:29 +0000
Subject: [PATCH 22/23] Correct the other test.yaml for github testing

---
 assets/github_testing/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml
index 7b35de1..c5a8bbf 100755
--- a/assets/github_testing/test.yaml
+++ b/assets/github_testing/test.yaml
@@ -1,7 +1,7 @@
 assembly_path: /home/runner/work/ascc/ascc/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa
 assembly_title: asccTinyTest
 pacbio_barcodes: /home/runner/work/ascc/ascc/pacbio_barcode/pacbio_adaptors.fa
-pacbio_multiplexing_barcode_names: "bc1008,bc1009"
+pacbio_multiplexing_barcode_names: "bc2008,bc2009"
 reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio
 reads_type: "hifi"
 sci_name: "Plasmodium yoelii yoelii 17XNL"

From afbbde76f75acafe6bf3c44917fa2a4921eae77d Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Mon, 20 Nov 2023 14:48:59 +0000
Subject: [PATCH 23/23] Fixed chunk assembly for vecscreen, added argparse for
 threshold variable

---
 bin/chunk_assembly_for_vecscreen.py           | 44 ++++++++++---------
 modules/local/chunk_assembly_for_vecscreen.nf |  3 +-
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py
index e36d1ed..bf8068a 100755
--- a/bin/chunk_assembly_for_vecscreen.py
+++ b/bin/chunk_assembly_for_vecscreen.py
@@ -10,32 +10,33 @@
 
 
 def process_record(record, chunk_num, threshold_length, overlap_length):
-    return (
-        record[chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length)]
-        if chunk_num * threshold_length < len(record) - (threshold_length + overlap_length)
-        else record[chunk_num * threshold_length :]
-    )
+    """
+    ID chunks within a certain threshold, this sequence is then excised
+    If it falls outside of the calculated threshold then rest of sequence is output
+    """
+    if chunk_num * threshold_length < len(record) - (threshold_length + overlap_length):
+        return record[chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length)]
+    else:
+        return record[chunk_num * threshold_length :]
 
 
 def generate_records_to_write(record, threshold_length, overlap_length):
-    return [
-        (
-            record_slice.id + ".chunk_{}".format(chunk_num + 1),
-            "",
-            record_slice,
-        )
-        for chunk_num, record_slice in enumerate(
-            process_record(record, i, threshold_length, overlap_length)
-            for i in range((len(record) - 1) // threshold_length + 1)
-        )
-    ]
-
-
-def main(fasta_input_file, fasta_output_file):
+    """
+    For record in input process the record
+    generate new id of id + counter
+    """
+    counter = 0
+    for i in range((len(record) - 1) // threshold_length + 1):
+        record_slice = process_record(record, i, threshold_length, overlap_length)
+        counter += 1
+        record_slice.id = record_slice.id + ".chunk_{}".format(counter)
+        return record_slice
+
+
+def main(fasta_input_file, fasta_output_file, threshold_length):
     fasta_input_file = os.path.abspath(fasta_input_file)
     fasta_output_file = os.path.abspath(fasta_output_file)
 
-    threshold_length = 500000
     overlap_length = int(threshold_length / 10)
     minimum_record_size = 11
 
@@ -54,6 +55,7 @@ def main(fasta_input_file, fasta_output_file):
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("fasta_input_file", type=str, help="Path to input FASTA file")
     parser.add_argument("fasta_output_file", type=str, help="Path for FASTA output file")
+    parser.add_argument("--threshold", type=int, default=500000, help="Threshold length of sequence")
     parser.add_argument("-v", "--version", action="version", version="1.0")
     args = parser.parse_args()
-    main(args.fasta_input_file, args.fasta_output_file)
+    main(args.fasta_input_file, args.fasta_output_file, args.threshold)
diff --git a/modules/local/chunk_assembly_for_vecscreen.nf b/modules/local/chunk_assembly_for_vecscreen.nf
index 9969e42..bf7a657 100644
--- a/modules/local/chunk_assembly_for_vecscreen.nf
+++ b/modules/local/chunk_assembly_for_vecscreen.nf
@@ -19,8 +19,9 @@ process CHUNK_ASSEMBLY_FOR_VECSCREEN {
 
     script:
     def prefix  = args.ext.prefix   ?: "${meta.id}"
+    def args    = args.ext.args     ?: ""
     """
-    chunk_assembly_for_vecscreen.py $fasta_input_file ${prefix}_chunked_assembly.fa
+    chunk_assembly_for_vecscreen.py $fasta_input_file ${prefix}_chunked_assembly.fa ${args}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":