From ddc2f300dfde6985bafc5fdca8c1653bdf4cc705 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Thu, 16 Nov 2023 18:00:31 +0000 Subject: [PATCH] Deleted trailing Ns scripts and reformatted the VecScreen Python scripts with black --- bin/VSlistTo1HitPerLine.py | 18 +++- bin/chunk_assembly_for_vecscreen.py | 52 +++++----- bin/summarise_vecscreen_output.py | 13 ++- bin/trim_Ns.py | 144 ---------------------------- modules/local/detect_trailing_n.nf | 42 -------- 5 files changed, 51 insertions(+), 218 deletions(-) delete mode 100755 bin/trim_Ns.py delete mode 100644 modules/local/detect_trailing_n.nf diff --git a/bin/VSlistTo1HitPerLine.py b/bin/VSlistTo1HitPerLine.py index 55b8ca7..87ea39d 100755 --- a/bin/VSlistTo1HitPerLine.py +++ b/bin/VSlistTo1HitPerLine.py @@ -16,6 +16,7 @@ import re import argparse + def main(args): hits_to_report = "" ID = "" @@ -27,14 +28,14 @@ def main(args): line = line.strip() Fld = line.split(" ") - if not re.match(r'^[0-9 \t]+$', line): + if not re.match(r"^[0-9 \t]+$", line): hits_to_report = "" if hits_to_report: print(f"VecScreen_{hits_to_report.ljust(8)}\t{ID}\t{line}") hits += 1 continue - + if line.startswith(">Vector "): if ID != "": if error_found: @@ -75,13 +76,20 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Reformatting VecScreen's output") - parser.add_argument("vecscreen_output_file", type=str, help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)", default=None) + parser.add_argument( + "vecscreen_output_file", + type=str, + help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)", + default=None, + ) parser.add_argument("--skip_reporting_strong_hits", action="store_true", help="Skip reporting strong hits") parser.add_argument("--skip_reporting_moderate_hits", action="store_true", help="Skip reporting moderate hits") parser.add_argument("--skip_reporting_weak_hits", action="store_true", help="Skip reporting weak hits") - parser.add_argument("--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin") + parser.add_argument( + "--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin" + ) parser.add_argument("--skip_reporting_no_hits", action="store_true", help="Skip reporting no-hits") parser.add_argument("--skip_reporting_errors", action="store_true", help="Skip reporting errors") parser.add_argument("-v", "--version", action="version", version="1.0") args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py index 5721745..321777c 100755 --- a/bin/chunk_assembly_for_vecscreen.py +++ b/bin/chunk_assembly_for_vecscreen.py @@ -7,42 +7,44 @@ import argparse import os -def main(fasta_input_file, fasta_output_file): - fasta_input_file = os.path.abspath(fasta_input_file) - fasta_output_file = os.path.abspath(fasta_output_file) - threshold_length = 500000 - overlap_length = int( threshold_length / 10 ) - minimum_record_size = 11 +def main(fasta_input_file, fasta_output_file): + fasta_input_file = os.path.abspath(fasta_input_file) + fasta_output_file = os.path.abspath(fasta_output_file) - fasta_output_handle = open(fasta_output_file, 'w') + threshold_length = 500000 + overlap_length = int(threshold_length / 10) + minimum_record_size = 11 - with open(fasta_input_file, 'r') as fasta_input_handle: - for record in SeqIO.parse(fasta_input_handle, "fasta"): + fasta_output_handle = open(fasta_output_file, "w") - if len(record) >= minimum_record_size: - records_to_write = [] + with open(fasta_input_file, "r") as fasta_input_handle: + for record in SeqIO.parse(fasta_input_handle, "fasta"): + if len(record) >= minimum_record_size: + records_to_write = [] - slice_count = 0 - while (slice_count * threshold_length) < len(record) - (threshold_length+overlap_length): - record_slice = record[(slice_count*threshold_length):((slice_count+1)*threshold_length + overlap_length)] - record_slice.id += '.chunk_' + str(slice_count+1) + slice_count = 0 + while (slice_count * threshold_length) < len(record) - (threshold_length + overlap_length): + record_slice = record[ + (slice_count * threshold_length) : ((slice_count + 1) * threshold_length + overlap_length) + ] + record_slice.id += ".chunk_" + str(slice_count + 1) - record_slice.description = '' - records_to_write.append(record_slice) - slice_count += 1 + record_slice.description = "" + records_to_write.append(record_slice) + slice_count += 1 - final_record_slice = record[(slice_count*threshold_length):] + final_record_slice = record[(slice_count * threshold_length) :] - if slice_count > 0: - final_record_slice.id += '.chunk_' + str(slice_count+1) - final_record_slice.description = '' + if slice_count > 0: + final_record_slice.id += ".chunk_" + str(slice_count + 1) + final_record_slice.description = "" - records_to_write.append(final_record_slice) + records_to_write.append(final_record_slice) - SeqIO.write(records_to_write, fasta_output_handle, 'fasta') + SeqIO.write(records_to_write, fasta_output_handle, "fasta") - fasta_output_handle.close() + fasta_output_handle.close() if __name__ == "__main__": diff --git a/bin/summarise_vecscreen_output.py b/bin/summarise_vecscreen_output.py index a513f53..50cb173 100755 --- a/bin/summarise_vecscreen_output.py +++ b/bin/summarise_vecscreen_output.py @@ -46,8 +46,17 @@ def main(vecscreen_file, chunk_size): if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("vecscreen_file", type=str, help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk") - parser.add_argument("--chunk_size", type=int, help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000", default=50000) + parser.add_argument( + "vecscreen_file", + type=str, + help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk", + ) + parser.add_argument( + "--chunk_size", + type=int, + help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000", + default=50000, + ) parser.add_argument("-v", "--version", action="version", version="1.0") args = parser.parse_args() main(args.vecscreen_file, args.chunk_size) diff --git a/bin/trim_Ns.py b/bin/trim_Ns.py deleted file mode 100755 index 3f20eb6..0000000 --- a/bin/trim_Ns.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 -""" -Script for detecting trailing Ns that should be trimmed from an assembly, from James Torrance. Edited by Eerik Aunin -""" - -import re -from Bio import SeqIO -import argparse - -def main(fasta_file, output_file): - - minleftover = 200 # after trimming start/end, at least this many bp should be left - winsize = 5000 # for sliding window analysis - minslidingBase = 0.4 # maximum fraction of Ns in sliding window before alarm sets off - - output_handle = open(output_file, 'w') - with open(fasta_file, "r") as fasta_input_handle: - for record in SeqIO.parse(fasta_input_handle, "fasta"): - # output_handle.write('Testing ' + record.id + "\n") - - # Do this twice: once with the regular string, once with the reversed string - # The output should be as one or more variables that can be reversed for the second iteration - seq_string = str(record.seq) - - n_count = seq_string.count('N') + seq_string.count('n') - n_perc = n_count/ len(seq_string) - if n_perc > 0.8: - output_handle.write('# WARNING: ' + record.id + '\t' + str(int(n_perc * 10000)/100) + ' % Ns of total ' + str(len(seq_string)) + "\n") - - # Consider handling this via two separate regexps for start and end. Adding character-class might be escalating run-time - #terminal_n_match = re.match('^([Nn]*)(\S+?)([Nn]*)$', seq_string) - start_n_match = re.match('^([Nn]*)', seq_string) - end_n_match = re.search('^([Nn]*)', seq_string[::-1]) - - startseq = '' - if start_n_match: - startseq = start_n_match.group(1) - endseq = '' - if end_n_match: - endseq = end_n_match.group(1) - realseq_length = len(seq_string) - len(startseq) - len(endseq) - # Handle "all Ns exception" - if len(startseq) == len(seq_string): - realseq_length = 0 - endseq = '' - - if len(startseq) > 0 or len(endseq) > 0: - if len(startseq) > 0: - output_handle.write('\t'.join(["TRIM:",record.id,'1',str(len(startseq))]) + "\t\n") - if len(endseq) > 0: - output_handle.write('\t'.join(["TRIM:",record.id,str(len(startseq)+realseq_length+1),str(len(seq_string))]) + "\t\n") - - if realseq_length <= minleftover: - output_handle.write("REMOVE: " + record.id + '\t' + str(realseq_length) + " bp leftover after trimming\n") - - - # Only attempt the windowing approach if there's an initial window that doesn't meet the trigger condition - for seq_string_loop, seq_string_for_window in enumerate([seq_string, seq_string[::-1]]): - if len(seq_string_for_window) > winsize and (seq_string_for_window[:winsize].count('N') + seq_string_for_window[:winsize].count('n')) > winsize * minslidingBase: - - non_n_regions = [] - non_n_iterator = re.finditer('[^Nn]+', seq_string_for_window) - for non_n_instance in non_n_iterator: - # output_handle.write(record.id + '\t' + str(non_n_instance.start(0)) + '\t' + str(non_n_instance.end(0)) + '\n') - - current_non_n_start = non_n_instance.start(0) + 1 - current_non_n_end = non_n_instance.end(0) - - non_n_regions.insert(0, [current_non_n_start, current_non_n_end]) - - # Does the *end* of this block satisfy the window condition? - bases_in_window = 0 - start_of_first_base_block_in_window = None - for non_n_region in non_n_regions: - if non_n_region[1] >= current_non_n_end - winsize: - start_of_first_base_block_in_window = non_n_region[0] - if non_n_region[0] >= current_non_n_end - winsize: - bases_in_window += non_n_region[1] - non_n_region[0] + 1 - else: - bases_in_window += non_n_region[1] - non_n_region[0] + 1 - break - else: - break - - if bases_in_window >= minslidingBase * winsize: - # Because start positions have BED-style - - # Remember that the blocks are in *reverse* order along the sequence - trackback_to_start_flag = False - - for i, non_n_region in enumerate(non_n_regions): - if i == 0: - continue - bases_in_window_2 = 0 - if non_n_region[1] < non_n_regions[0][0] - winsize: - break - else: - current_window_start = max(non_n_region[0], non_n_regions[0][0] - winsize) - - # Count the bases from this block - bases_in_window_2 += min(non_n_region[1], (current_window_start+winsize-1) ) - current_window_start + 1 - - # Add up all the sequence in blocks tested thus far, but not the final block - for j in range(1, i): - bases_in_window_2 += non_n_regions[j][1] - non_n_regions[j][0] + 1 - - # Count the sequence in the final block that can contribute - # This is the region between the start of the final block and either the end of the block - # or the end of a window extending from the current test start point, whichever comes first - bases_in_window_2 += min( non_n_regions[0][1], (current_window_start + winsize-1) ) - non_n_regions[0][0] + 1 - - # output_handle.write('BIW: ' + str(i) + ' ' + str(bases_in_window_2) + "\n") - if bases_in_window_2 >= minslidingBase * winsize: - if current_window_start == non_n_region[0]: - start_of_first_base_block_in_window = current_window_start - else: - start_of_first_base_block_in_window = non_n_regions[i-1][0] - else: - # We keep going back until the threshold condition is *not* met - break - - # If we find the break-point should be before the first block, then we don't want to trim at all! - if i == len(non_n_regions) - 1: - trackback_to_start_flag = True - - - # Only trim if the breakpoint isn't before the first block - # and if the breakpoint isn't at the start of the sequence - if not(trackback_to_start_flag) and start_of_first_base_block_in_window != 1: - if seq_string_loop == 0: - output_handle.write("FWDCLIP:\t" + record.id + "\t1\t" + str(start_of_first_base_block_in_window - 1) + "\n") - else: - output_handle.write("REVCLIP:\t" + record.id + '\t' + str(len(seq_string) - start_of_first_base_block_in_window + 2) + '\t' + str(len(seq_string)) + "\n") - - break - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("fasta_file", type=str, help="Path to input FASTA file") - parser.add_argument("output_file", type=str, help="Path for output report file") - parser.add_argument("-v", "--version", action="version", version="1.0") - args = parser.parse_args() - main(args.fasta_file, args.output_file) \ No newline at end of file diff --git a/modules/local/detect_trailing_n.nf b/modules/local/detect_trailing_n.nf deleted file mode 100644 index 0ddf966..0000000 --- a/modules/local/detect_trailing_n.nf +++ /dev/null @@ -1,42 +0,0 @@ -process DETECT_TRAILING_N { - tag "$meta.id" - label 'process_single' - - conda "bioconda::biopython=1.70" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py35_1': - 'quay.io/biocontainers/biopython:1.70--np112py35_1' }" - - input: - tuple val(meta), path(fasta_input_file) - - output: - tuple val(meta), path("*_trim_Ns"), emit: trailing_n_report - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - trim_Ns.py ${fasta_input_file} ${prefix}_trim_Ns - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}_trim_Ns - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2) - END_VERSIONS - """ -} \ No newline at end of file