From ddc2f300dfde6985bafc5fdca8c1653bdf4cc705 Mon Sep 17 00:00:00 2001
From: eeaunin <eeaunin@gmail.com>
Date: Thu, 16 Nov 2023 18:00:31 +0000
Subject: [PATCH] Deleted trailing Ns scripts and reformatted the VecScreen
 Python scripts with black

---
 bin/VSlistTo1HitPerLine.py          |  18 +++-
 bin/chunk_assembly_for_vecscreen.py |  52 +++++-----
 bin/summarise_vecscreen_output.py   |  13 ++-
 bin/trim_Ns.py                      | 144 ----------------------------
 modules/local/detect_trailing_n.nf  |  42 --------
 5 files changed, 51 insertions(+), 218 deletions(-)
 delete mode 100755 bin/trim_Ns.py
 delete mode 100644 modules/local/detect_trailing_n.nf

diff --git a/bin/VSlistTo1HitPerLine.py b/bin/VSlistTo1HitPerLine.py
index 55b8ca7..87ea39d 100755
--- a/bin/VSlistTo1HitPerLine.py
+++ b/bin/VSlistTo1HitPerLine.py
@@ -16,6 +16,7 @@
 import re
 import argparse
 
+
 def main(args):
     hits_to_report = ""
     ID = ""
@@ -27,14 +28,14 @@ def main(args):
             line = line.strip()
             Fld = line.split(" ")
 
-            if not re.match(r'^[0-9 \t]+$', line):
+            if not re.match(r"^[0-9 \t]+$", line):
                 hits_to_report = ""
 
             if hits_to_report:
                 print(f"VecScreen_{hits_to_report.ljust(8)}\t{ID}\t{line}")
                 hits += 1
                 continue
-            
+
             if line.startswith(">Vector "):
                 if ID != "":
                     if error_found:
@@ -75,13 +76,20 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Reformatting VecScreen's output")
-    parser.add_argument("vecscreen_output_file", type=str, help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)", default=None)
+    parser.add_argument(
+        "vecscreen_output_file",
+        type=str,
+        help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)",
+        default=None,
+    )
     parser.add_argument("--skip_reporting_strong_hits", action="store_true", help="Skip reporting strong hits")
     parser.add_argument("--skip_reporting_moderate_hits", action="store_true", help="Skip reporting moderate hits")
     parser.add_argument("--skip_reporting_weak_hits", action="store_true", help="Skip reporting weak hits")
-    parser.add_argument("--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin")
+    parser.add_argument(
+        "--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin"
+    )
     parser.add_argument("--skip_reporting_no_hits", action="store_true", help="Skip reporting no-hits")
     parser.add_argument("--skip_reporting_errors", action="store_true", help="Skip reporting errors")
     parser.add_argument("-v", "--version", action="version", version="1.0")
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+    main(args)
diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py
index 5721745..321777c 100755
--- a/bin/chunk_assembly_for_vecscreen.py
+++ b/bin/chunk_assembly_for_vecscreen.py
@@ -7,42 +7,44 @@
 import argparse
 import os
 
-def main(fasta_input_file, fasta_output_file):
-	fasta_input_file = os.path.abspath(fasta_input_file)
-	fasta_output_file = os.path.abspath(fasta_output_file)
 
-	threshold_length = 500000
-	overlap_length = int( threshold_length / 10 )
-	minimum_record_size = 11
+def main(fasta_input_file, fasta_output_file):
+    fasta_input_file = os.path.abspath(fasta_input_file)
+    fasta_output_file = os.path.abspath(fasta_output_file)
 
-	fasta_output_handle = open(fasta_output_file, 'w')
+    threshold_length = 500000
+    overlap_length = int(threshold_length / 10)
+    minimum_record_size = 11
 
-	with open(fasta_input_file, 'r') as fasta_input_handle:
-		for record in SeqIO.parse(fasta_input_handle, "fasta"):
+    fasta_output_handle = open(fasta_output_file, "w")
 
-			if len(record) >= minimum_record_size:
-				records_to_write = []
+    with open(fasta_input_file, "r") as fasta_input_handle:
+        for record in SeqIO.parse(fasta_input_handle, "fasta"):
+            if len(record) >= minimum_record_size:
+                records_to_write = []
 
-				slice_count = 0
-				while (slice_count * threshold_length) < len(record) - (threshold_length+overlap_length):
-					record_slice = record[(slice_count*threshold_length):((slice_count+1)*threshold_length + overlap_length)]
-					record_slice.id += '.chunk_' + str(slice_count+1)
+                slice_count = 0
+                while (slice_count * threshold_length) < len(record) - (threshold_length + overlap_length):
+                    record_slice = record[
+                        (slice_count * threshold_length) : ((slice_count + 1) * threshold_length + overlap_length)
+                    ]
+                    record_slice.id += ".chunk_" + str(slice_count + 1)
 
-					record_slice.description = ''
-					records_to_write.append(record_slice)
-					slice_count += 1
+                    record_slice.description = ""
+                    records_to_write.append(record_slice)
+                    slice_count += 1
 
-				final_record_slice = record[(slice_count*threshold_length):]
+                final_record_slice = record[(slice_count * threshold_length) :]
 
-				if slice_count > 0:
-					final_record_slice.id += '.chunk_' + str(slice_count+1)
-				final_record_slice.description = ''
+                if slice_count > 0:
+                    final_record_slice.id += ".chunk_" + str(slice_count + 1)
+                final_record_slice.description = ""
 
-				records_to_write.append(final_record_slice)
+                records_to_write.append(final_record_slice)
 
-				SeqIO.write(records_to_write, fasta_output_handle, 'fasta')
+                SeqIO.write(records_to_write, fasta_output_handle, "fasta")
 
-	fasta_output_handle.close()
+    fasta_output_handle.close()
 
 
 if __name__ == "__main__":
diff --git a/bin/summarise_vecscreen_output.py b/bin/summarise_vecscreen_output.py
index a513f53..50cb173 100755
--- a/bin/summarise_vecscreen_output.py
+++ b/bin/summarise_vecscreen_output.py
@@ -46,8 +46,17 @@ def main(vecscreen_file, chunk_size):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("vecscreen_file", type=str, help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk")
-    parser.add_argument("--chunk_size", type=int, help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000", default=50000)
+    parser.add_argument(
+        "vecscreen_file",
+        type=str,
+        help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk",
+    )
+    parser.add_argument(
+        "--chunk_size",
+        type=int,
+        help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000",
+        default=50000,
+    )
     parser.add_argument("-v", "--version", action="version", version="1.0")
     args = parser.parse_args()
     main(args.vecscreen_file, args.chunk_size)
diff --git a/bin/trim_Ns.py b/bin/trim_Ns.py
deleted file mode 100755
index 3f20eb6..0000000
--- a/bin/trim_Ns.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-"""
-Script for detecting trailing Ns that should be trimmed from an assembly, from James Torrance. Edited by Eerik Aunin
-"""
-
-import re
-from Bio import SeqIO
-import argparse
-
-def main(fasta_file, output_file):
-
-	minleftover = 200   # after trimming start/end, at least this many bp should be left
-	winsize     = 5000  # for sliding window analysis
-	minslidingBase = 0.4   # maximum fraction of Ns in sliding window before alarm sets off
-
-	output_handle = open(output_file, 'w')
-	with open(fasta_file, "r") as fasta_input_handle:
-		for record in SeqIO.parse(fasta_input_handle, "fasta"):
-			# output_handle.write('Testing ' + record.id + "\n")
-
-			# Do this twice: once with the regular string, once with the reversed string
-			# The output should be as one or more variables that can be reversed for the second iteration
-			seq_string = str(record.seq)
-
-			n_count = seq_string.count('N') + seq_string.count('n')
-			n_perc = n_count/ len(seq_string)
-			if n_perc > 0.8:
-				output_handle.write('# WARNING: ' + record.id + '\t' + str(int(n_perc * 10000)/100) + ' % Ns of total ' + str(len(seq_string)) + "\n")
-
-			# Consider handling this via two separate regexps for start and end. Adding character-class might be escalating run-time
-			#terminal_n_match = re.match('^([Nn]*)(\S+?)([Nn]*)$', seq_string)
-			start_n_match = re.match('^([Nn]*)', seq_string)
-			end_n_match = re.search('^([Nn]*)', seq_string[::-1])
-
-			startseq = ''
-			if start_n_match:
-				startseq = start_n_match.group(1)
-			endseq = ''
-			if end_n_match:
-				endseq = end_n_match.group(1)
-			realseq_length = len(seq_string) - len(startseq) - len(endseq)
-			# Handle "all Ns exception"
-			if len(startseq) == len(seq_string):
-				realseq_length = 0
-				endseq = '' 
-
-			if len(startseq) > 0 or len(endseq) > 0:
-				if len(startseq) > 0:
-					output_handle.write('\t'.join(["TRIM:",record.id,'1',str(len(startseq))]) + "\t\n")
-				if len(endseq) > 0:
-					output_handle.write('\t'.join(["TRIM:",record.id,str(len(startseq)+realseq_length+1),str(len(seq_string))]) + "\t\n")
-
-			if realseq_length <= minleftover:
-				output_handle.write("REMOVE: " + record.id + '\t' + str(realseq_length) + " bp leftover after trimming\n")
-			
-
-			# Only attempt the windowing approach if there's an initial window that doesn't meet the trigger condition
-			for seq_string_loop, seq_string_for_window in enumerate([seq_string, seq_string[::-1]]):
-				if len(seq_string_for_window) > winsize and (seq_string_for_window[:winsize].count('N') + seq_string_for_window[:winsize].count('n')) > winsize * minslidingBase:
-
-					non_n_regions = []
-					non_n_iterator = re.finditer('[^Nn]+', seq_string_for_window)
-					for non_n_instance in non_n_iterator:
-						# output_handle.write(record.id + '\t' + str(non_n_instance.start(0)) + '\t' + str(non_n_instance.end(0)) + '\n')
-
-						current_non_n_start = non_n_instance.start(0) + 1
-						current_non_n_end = non_n_instance.end(0)
-
-						non_n_regions.insert(0, [current_non_n_start, current_non_n_end])
-
-						# Does the *end* of this block satisfy the window condition?
-						bases_in_window = 0
-						start_of_first_base_block_in_window = None
-						for non_n_region in non_n_regions:
-							if non_n_region[1] >= current_non_n_end - winsize:
-								start_of_first_base_block_in_window = non_n_region[0]
-								if non_n_region[0] >= current_non_n_end - winsize:
-									bases_in_window += non_n_region[1] - non_n_region[0] + 1
-								else:
-									bases_in_window += non_n_region[1] - non_n_region[0] + 1
-									break
-							else:
-								break
-
-						if bases_in_window >= minslidingBase * winsize:
-							# Because start positions have BED-style
-
-							# Remember that the blocks are in *reverse* order along the sequence
-							trackback_to_start_flag = False
-
-							for i, non_n_region in enumerate(non_n_regions):
-								if i == 0:
-									continue
-								bases_in_window_2 = 0
-								if non_n_region[1] < non_n_regions[0][0] - winsize:
-									break
-								else:
-									current_window_start = max(non_n_region[0], non_n_regions[0][0] - winsize)
-
-									# Count the bases from this block
-									bases_in_window_2 += min(non_n_region[1], (current_window_start+winsize-1) ) - current_window_start + 1
-
-									# Add up all the sequence in blocks tested thus far, but not the final block
-									for j in range(1, i):
-										bases_in_window_2 += non_n_regions[j][1] - non_n_regions[j][0] + 1
-
-									# Count the sequence in the final block that can contribute
-									# This is the region between the start of the final block and either the end of the block
-									# or the end of a window extending from the current test start point, whichever comes first
-									bases_in_window_2 += min( non_n_regions[0][1], (current_window_start + winsize-1) ) - non_n_regions[0][0] + 1
-
-									# output_handle.write('BIW: ' + str(i) + ' ' + str(bases_in_window_2) + "\n")
-									if bases_in_window_2 >= minslidingBase * winsize:
-										if current_window_start == non_n_region[0]:
-											start_of_first_base_block_in_window = current_window_start
-										else:
-											start_of_first_base_block_in_window = non_n_regions[i-1][0]
-									else:
-										# We keep going back until the threshold condition is *not* met
-										break
-
-									# If we find the break-point should be before the first block, then we don't want to trim at all!										
-									if i == len(non_n_regions) - 1:
-										trackback_to_start_flag = True
-
-
-							# Only trim if the breakpoint isn't before the first block
-							# and if the breakpoint isn't at the start of the sequence
-							if not(trackback_to_start_flag) and start_of_first_base_block_in_window != 1:
-								if seq_string_loop == 0:
-									output_handle.write("FWDCLIP:\t" + record.id + "\t1\t" + str(start_of_first_base_block_in_window - 1) + "\n")
-								else:
-									output_handle.write("REVCLIP:\t" + record.id + '\t' + str(len(seq_string) - start_of_first_base_block_in_window + 2) + '\t' + str(len(seq_string)) + "\n")
-
-							break
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("fasta_file", type=str, help="Path to input FASTA file")
-    parser.add_argument("output_file", type=str, help="Path for output report file")
-    parser.add_argument("-v", "--version", action="version", version="1.0")
-    args = parser.parse_args()
-    main(args.fasta_file, args.output_file)
\ No newline at end of file
diff --git a/modules/local/detect_trailing_n.nf b/modules/local/detect_trailing_n.nf
deleted file mode 100644
index 0ddf966..0000000
--- a/modules/local/detect_trailing_n.nf
+++ /dev/null
@@ -1,42 +0,0 @@
-process DETECT_TRAILING_N {
-    tag "$meta.id"
-    label 'process_single'
-
-    conda "bioconda::biopython=1.70"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py35_1':
-        'quay.io/biocontainers/biopython:1.70--np112py35_1' }"
-
-    input:
-    tuple val(meta), path(fasta_input_file)
-
-    output:
-    tuple val(meta), path("*_trim_Ns"), emit: trailing_n_report
-    path "versions.yml"           , emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    trim_Ns.py ${fasta_input_file} ${prefix}_trim_Ns
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        python: \$(python --version | sed 's/Python //g')
-        trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2)
-    END_VERSIONS
-    """
-
-    stub:
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    touch ${prefix}_trim_Ns
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        python: \$(python --version | sed 's/Python //g')
-        trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2)
-    END_VERSIONS
-    """
-}
\ No newline at end of file