From ca6e3d6c16f179fde1091766ca88d1c05cf85572 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Thu, 16 Nov 2023 17:02:11 +0000 Subject: [PATCH 01/23] added VecScreen related files --- bin/VSlistTo1HitPerLine.py | 87 +++++++++++ bin/chunk_assembly_for_vecscreen.py | 54 +++++++ bin/summarise_vecscreen_output.py | 53 +++++++ bin/trim_Ns.py | 144 ++++++++++++++++++ conf/modules.config | 4 + modules.json | 5 + modules/local/chunk_assembly_for_vecscreen.nf | 41 +++++ modules/local/detect_trailing_n.nf | 42 +++++ modules/local/filter_vecscreen_results.nf | 41 +++++ modules/local/summarise_vecscreen_output.nf | 41 +++++ modules/nf-core/ncbitools/vecscreen/main.nf | 46 ++++++ modules/nf-core/ncbitools/vecscreen/meta.yml | 50 ++++++ subworkflows/local/run_vecscreen.nf | 34 +++++ 13 files changed, 642 insertions(+) create mode 100755 bin/VSlistTo1HitPerLine.py create mode 100755 bin/chunk_assembly_for_vecscreen.py create mode 100755 bin/summarise_vecscreen_output.py create mode 100755 bin/trim_Ns.py create mode 100644 modules/local/chunk_assembly_for_vecscreen.nf create mode 100644 modules/local/detect_trailing_n.nf create mode 100644 modules/local/filter_vecscreen_results.nf create mode 100644 modules/local/summarise_vecscreen_output.nf create mode 100644 modules/nf-core/ncbitools/vecscreen/main.nf create mode 100644 modules/nf-core/ncbitools/vecscreen/meta.yml create mode 100644 subworkflows/local/run_vecscreen.nf diff --git a/bin/VSlistTo1HitPerLine.py b/bin/VSlistTo1HitPerLine.py new file mode 100755 index 0000000..55b8ca7 --- /dev/null +++ b/bin/VSlistTo1HitPerLine.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +""" +This script is based on an AWK script from NCBI: https://ftp.ncbi.nlm.nih.gov/pub/kitts/VSlistTo1HitPerLine.awk, orginally by Paul Kitts. +Converted to Python by Eerik Aunin. + +This script converts the VecScreen text list output to one line giving the coordinates for each vector segment in the format: +VecScreen_Category ID_string start_position end_position +The default is to report Strong, Moderate, and Weak matches and also segments of Suspect Origin. Reporting of any category can be suppressed by including +--skip_reporting_suspect_hits, --skip_reporting_weak_hits, --skip_reporting_moderate_hits or --skip_reporting_strong_hits on the command line. +"No hits" will be reported for any Query sequence that had no matches in any of the selected categories, unless --skip_reporting_no_hits is included on the command line. +VecScreen errors will be reported unless --skip_reporting_errors is included on the command line. +Usage: +VSlistTo1HitPerLine.py [options] vecscreen_output_file +""" + +import re +import argparse + +def main(args): + hits_to_report = "" + ID = "" + hits = 0 + error_found = 0 + + with open(args.vecscreen_output_file, "r") as file: + for line in file: + line = line.strip() + Fld = line.split(" ") + + if not re.match(r'^[0-9 \t]+$', line): + hits_to_report = "" + + if hits_to_report: + print(f"VecScreen_{hits_to_report.ljust(8)}\t{ID}\t{line}") + hits += 1 + continue + + if line.startswith(">Vector "): + if ID != "": + if error_found: + print(f"VecScreen_ERROR \t{ID}") + elif hits == 0 and args.skip_reporting_no_hits is False: + print(f"VecScreen_No_Hits \t{ID}") + ID = Fld[1] + hits = 0 + error_found = 0 + continue + + if args.skip_reporting_strong_hits is False and line.startswith("Strong"): + hits_to_report = "Strong" + continue + + if args.skip_reporting_moderate_hits is False and line.startswith("Moderate"): + hits_to_report = "Moderate" + continue + + if args.skip_reporting_weak_hits is False and line.startswith("Weak"): + hits_to_report = "Weak" + continue + + if args.skip_reporting_suspect_hits is False and line.startswith("Suspect"): + hits_to_report = "Suspect" + continue + + if args.skip_reporting_errors is False: + if line.startswith("ERROR") or line.startswith("WARNING"): + error_found += 1 + + if ID != "": + if error_found > 0: + print(f"VecScreen_ERROR \t{ID}") + elif hits == 0 and args.skip_reporting_no_hits is False: + print(f"VecScreen_No_Hits \t{ID}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Reformatting VecScreen's output") + parser.add_argument("vecscreen_output_file", type=str, help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)", default=None) + parser.add_argument("--skip_reporting_strong_hits", action="store_true", help="Skip reporting strong hits") + parser.add_argument("--skip_reporting_moderate_hits", action="store_true", help="Skip reporting moderate hits") + parser.add_argument("--skip_reporting_weak_hits", action="store_true", help="Skip reporting weak hits") + parser.add_argument("--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin") + parser.add_argument("--skip_reporting_no_hits", action="store_true", help="Skip reporting no-hits") + parser.add_argument("--skip_reporting_errors", action="store_true", help="Skip reporting errors") + parser.add_argument("-v", "--version", action="version", version="1.0") + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py new file mode 100755 index 0000000..5721745 --- /dev/null +++ b/bin/chunk_assembly_for_vecscreen.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +""" +Script for chunking an assembly before running NCBI VecScreen. Adapted from a script by James Torrance, edited by Eerik Aunin +""" + +from Bio import SeqIO +import argparse +import os + +def main(fasta_input_file, fasta_output_file): + fasta_input_file = os.path.abspath(fasta_input_file) + fasta_output_file = os.path.abspath(fasta_output_file) + + threshold_length = 500000 + overlap_length = int( threshold_length / 10 ) + minimum_record_size = 11 + + fasta_output_handle = open(fasta_output_file, 'w') + + with open(fasta_input_file, 'r') as fasta_input_handle: + for record in SeqIO.parse(fasta_input_handle, "fasta"): + + if len(record) >= minimum_record_size: + records_to_write = [] + + slice_count = 0 + while (slice_count * threshold_length) < len(record) - (threshold_length+overlap_length): + record_slice = record[(slice_count*threshold_length):((slice_count+1)*threshold_length + overlap_length)] + record_slice.id += '.chunk_' + str(slice_count+1) + + record_slice.description = '' + records_to_write.append(record_slice) + slice_count += 1 + + final_record_slice = record[(slice_count*threshold_length):] + + if slice_count > 0: + final_record_slice.id += '.chunk_' + str(slice_count+1) + final_record_slice.description = '' + + records_to_write.append(final_record_slice) + + SeqIO.write(records_to_write, fasta_output_handle, 'fasta') + + fasta_output_handle.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("fasta_input_file", type=str, help="Path to input FASTA file") + parser.add_argument("fasta_output_file", type=str, help="Path for FASTA output file") + parser.add_argument("-v", "--version", action="version", version="1.0") + args = parser.parse_args() + main(args.fasta_input_file, args.fasta_output_file) diff --git a/bin/summarise_vecscreen_output.py b/bin/summarise_vecscreen_output.py new file mode 100755 index 0000000..a513f53 --- /dev/null +++ b/bin/summarise_vecscreen_output.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +""" +Script for reformatting output from VecScreen that has been processed using the VSlistTo1HitPerLine.awk script. +This script removes no-hit entries from the report file and converts coordinates in assembly chunks to coordinates in the full assembly. +The original script was written in Perl by James Torrance. Rewritten by Eerik Aunin. +""" + +import argparse +import os +import sys + + +def main(vecscreen_file, chunk_size): + if os.path.isfile(vecscreen_file) is False: + sys.stderr.write(f"The input file for this script ({vecscreen_file}) was not found\n") + sys.exit(1) + + with open(vecscreen_file) as f: + vecscreen_data = f.readlines() + + output_lines = list() + + for line in vecscreen_data: + if line.startswith("VecScreen_No_Hits") is True: + continue + split_line = line.split() + assert len(split_line) == 4 + vecscreen_result, seq_name, seq_start, seq_end = split_line + seq_start = int(seq_start) + seq_end = int(seq_end) + seq_base_name = seq_name + if ".chunk_" in seq_name: + split_seq_name = seq_name.split(".chunk_") + assert len(split_seq_name) == 2 + seq_base_name = split_seq_name[0] + chunk_suffix = int(split_seq_name[1]) + seq_start += chunk_size * (chunk_suffix - 1) + seq_end += chunk_size * (chunk_suffix - 1) + corrected_line = "\t".join((vecscreen_result, seq_base_name, str(seq_start), str(seq_end))) + if corrected_line not in output_lines: + output_lines.append(corrected_line) + + for line in output_lines: + print(line) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("vecscreen_file", type=str, help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk") + parser.add_argument("--chunk_size", type=int, help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000", default=50000) + parser.add_argument("-v", "--version", action="version", version="1.0") + args = parser.parse_args() + main(args.vecscreen_file, args.chunk_size) diff --git a/bin/trim_Ns.py b/bin/trim_Ns.py new file mode 100755 index 0000000..3f20eb6 --- /dev/null +++ b/bin/trim_Ns.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Script for detecting trailing Ns that should be trimmed from an assembly, from James Torrance. Edited by Eerik Aunin +""" + +import re +from Bio import SeqIO +import argparse + +def main(fasta_file, output_file): + + minleftover = 200 # after trimming start/end, at least this many bp should be left + winsize = 5000 # for sliding window analysis + minslidingBase = 0.4 # maximum fraction of Ns in sliding window before alarm sets off + + output_handle = open(output_file, 'w') + with open(fasta_file, "r") as fasta_input_handle: + for record in SeqIO.parse(fasta_input_handle, "fasta"): + # output_handle.write('Testing ' + record.id + "\n") + + # Do this twice: once with the regular string, once with the reversed string + # The output should be as one or more variables that can be reversed for the second iteration + seq_string = str(record.seq) + + n_count = seq_string.count('N') + seq_string.count('n') + n_perc = n_count/ len(seq_string) + if n_perc > 0.8: + output_handle.write('# WARNING: ' + record.id + '\t' + str(int(n_perc * 10000)/100) + ' % Ns of total ' + str(len(seq_string)) + "\n") + + # Consider handling this via two separate regexps for start and end. Adding character-class might be escalating run-time + #terminal_n_match = re.match('^([Nn]*)(\S+?)([Nn]*)$', seq_string) + start_n_match = re.match('^([Nn]*)', seq_string) + end_n_match = re.search('^([Nn]*)', seq_string[::-1]) + + startseq = '' + if start_n_match: + startseq = start_n_match.group(1) + endseq = '' + if end_n_match: + endseq = end_n_match.group(1) + realseq_length = len(seq_string) - len(startseq) - len(endseq) + # Handle "all Ns exception" + if len(startseq) == len(seq_string): + realseq_length = 0 + endseq = '' + + if len(startseq) > 0 or len(endseq) > 0: + if len(startseq) > 0: + output_handle.write('\t'.join(["TRIM:",record.id,'1',str(len(startseq))]) + "\t\n") + if len(endseq) > 0: + output_handle.write('\t'.join(["TRIM:",record.id,str(len(startseq)+realseq_length+1),str(len(seq_string))]) + "\t\n") + + if realseq_length <= minleftover: + output_handle.write("REMOVE: " + record.id + '\t' + str(realseq_length) + " bp leftover after trimming\n") + + + # Only attempt the windowing approach if there's an initial window that doesn't meet the trigger condition + for seq_string_loop, seq_string_for_window in enumerate([seq_string, seq_string[::-1]]): + if len(seq_string_for_window) > winsize and (seq_string_for_window[:winsize].count('N') + seq_string_for_window[:winsize].count('n')) > winsize * minslidingBase: + + non_n_regions = [] + non_n_iterator = re.finditer('[^Nn]+', seq_string_for_window) + for non_n_instance in non_n_iterator: + # output_handle.write(record.id + '\t' + str(non_n_instance.start(0)) + '\t' + str(non_n_instance.end(0)) + '\n') + + current_non_n_start = non_n_instance.start(0) + 1 + current_non_n_end = non_n_instance.end(0) + + non_n_regions.insert(0, [current_non_n_start, current_non_n_end]) + + # Does the *end* of this block satisfy the window condition? + bases_in_window = 0 + start_of_first_base_block_in_window = None + for non_n_region in non_n_regions: + if non_n_region[1] >= current_non_n_end - winsize: + start_of_first_base_block_in_window = non_n_region[0] + if non_n_region[0] >= current_non_n_end - winsize: + bases_in_window += non_n_region[1] - non_n_region[0] + 1 + else: + bases_in_window += non_n_region[1] - non_n_region[0] + 1 + break + else: + break + + if bases_in_window >= minslidingBase * winsize: + # Because start positions have BED-style + + # Remember that the blocks are in *reverse* order along the sequence + trackback_to_start_flag = False + + for i, non_n_region in enumerate(non_n_regions): + if i == 0: + continue + bases_in_window_2 = 0 + if non_n_region[1] < non_n_regions[0][0] - winsize: + break + else: + current_window_start = max(non_n_region[0], non_n_regions[0][0] - winsize) + + # Count the bases from this block + bases_in_window_2 += min(non_n_region[1], (current_window_start+winsize-1) ) - current_window_start + 1 + + # Add up all the sequence in blocks tested thus far, but not the final block + for j in range(1, i): + bases_in_window_2 += non_n_regions[j][1] - non_n_regions[j][0] + 1 + + # Count the sequence in the final block that can contribute + # This is the region between the start of the final block and either the end of the block + # or the end of a window extending from the current test start point, whichever comes first + bases_in_window_2 += min( non_n_regions[0][1], (current_window_start + winsize-1) ) - non_n_regions[0][0] + 1 + + # output_handle.write('BIW: ' + str(i) + ' ' + str(bases_in_window_2) + "\n") + if bases_in_window_2 >= minslidingBase * winsize: + if current_window_start == non_n_region[0]: + start_of_first_base_block_in_window = current_window_start + else: + start_of_first_base_block_in_window = non_n_regions[i-1][0] + else: + # We keep going back until the threshold condition is *not* met + break + + # If we find the break-point should be before the first block, then we don't want to trim at all! + if i == len(non_n_regions) - 1: + trackback_to_start_flag = True + + + # Only trim if the breakpoint isn't before the first block + # and if the breakpoint isn't at the start of the sequence + if not(trackback_to_start_flag) and start_of_first_base_block_in_window != 1: + if seq_string_loop == 0: + output_handle.write("FWDCLIP:\t" + record.id + "\t1\t" + str(start_of_first_base_block_in_window - 1) + "\n") + else: + output_handle.write("REVCLIP:\t" + record.id + '\t' + str(len(seq_string) - start_of_first_base_block_in_window + 2) + '\t' + str(len(seq_string)) + "\n") + + break + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("fasta_file", type=str, help="Path to input FASTA file") + parser.add_argument("output_file", type=str, help="Path for output report file") + parser.add_argument("-v", "--version", action="version", version="1.0") + args = parser.parse_args() + main(args.fasta_file, args.output_file) \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index 900ec4a..2febc01 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -95,4 +95,8 @@ process { ext.prefix = { "${meta.id}_alignment_${reference.getName().tokenize('.')[0]}" } } + withName: NCBITOOLS_VECSCREEN { + ext.args = { "-f3" } + } + } diff --git a/modules.json b/modules.json index 6a8a74a..f8c9ac1 100644 --- a/modules.json +++ b/modules.json @@ -73,6 +73,11 @@ "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", "installed_by": ["modules"] }, + "ncbitools/vecscreen": { + "branch": "master", + "git_sha": "1e4ac4aa2c612f9547f79f02ef7c651ccc9f657b", + "installed_by": ["modules"] + }, "samtools/depth": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", diff --git a/modules/local/chunk_assembly_for_vecscreen.nf b/modules/local/chunk_assembly_for_vecscreen.nf new file mode 100644 index 0000000..727e04b --- /dev/null +++ b/modules/local/chunk_assembly_for_vecscreen.nf @@ -0,0 +1,41 @@ +process CHUNK_ASSEMBLY_FOR_VECSCREEN { + tag "$meta.id" + label 'process_single' + + conda "bioconda::biopython=1.70" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py35_1': + 'quay.io/biocontainers/biopython:1.70--np112py35_1' }" + + input: + tuple val(meta), path(fasta_input_file) + + output: + tuple val(meta), path("chunked_assembly.fa"), emit: chunked_assembly + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + chunk_assembly_for_vecscreen.py $fasta_input_file chunked_assembly.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + chunk_assembly_for_vecscreen.py: \$(chunk_assembly_for_vecscreen.py --version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + """ + touch chunked_assembly.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + chunk_assembly_for_vecscreen.py: \$(chunk_assembly_for_vecscreen.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/local/detect_trailing_n.nf b/modules/local/detect_trailing_n.nf new file mode 100644 index 0000000..0ddf966 --- /dev/null +++ b/modules/local/detect_trailing_n.nf @@ -0,0 +1,42 @@ +process DETECT_TRAILING_N { + tag "$meta.id" + label 'process_single' + + conda "bioconda::biopython=1.70" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py35_1': + 'quay.io/biocontainers/biopython:1.70--np112py35_1' }" + + input: + tuple val(meta), path(fasta_input_file) + + output: + tuple val(meta), path("*_trim_Ns"), emit: trailing_n_report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + trim_Ns.py ${fasta_input_file} ${prefix}_trim_Ns + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_trim_Ns + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/filter_vecscreen_results.nf b/modules/local/filter_vecscreen_results.nf new file mode 100644 index 0000000..38c701e --- /dev/null +++ b/modules/local/filter_vecscreen_results.nf @@ -0,0 +1,41 @@ +process FILTER_VECSCREEN_RESULTS { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + tuple val(meta), path(vecscreen_outfile) + + output: + tuple val(meta), path("vecscreen.grepped.out"), emit: filtered_vecscreen_outfile + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + VSlistTo1HitPerLine.py --skip_reporting_suspect_hits --skip_reporting_weak_hits --skip_reporting_no_hits ${vecscreen_outfile} > vecscreen.grepped.out + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + VSlistTo1HitPerLine: \$(VSlistTo1HitPerLine.py -v) + END_VERSIONS + """ + + stub: + """ + touch vecscreen.grepped.out + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + VSlistTo1HitPerLine: \$(VSlistTo1HitPerLine.py -v) + END_VERSIONS + """ +} diff --git a/modules/local/summarise_vecscreen_output.nf b/modules/local/summarise_vecscreen_output.nf new file mode 100644 index 0000000..3154c3e --- /dev/null +++ b/modules/local/summarise_vecscreen_output.nf @@ -0,0 +1,41 @@ +process SUMMARISE_VECSCREEN_OUTPUT { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + tuple val(meta), path(vecscreen_filtered_outfile) + + output: + tuple val(meta), path("*.vecscreen_contamination"), emit: vecscreen_contamination + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + summarise_vecscreen_output.py ${vecscreen_filtered_outfile} > ${prefix}.vecscreen_contamination + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + summarise_vecscreen_output.py: \$(summarise_vecscreen_output.py --version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vecscreen_contamination + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + summarise_vecscreen_output.py: \$(summarise_vecscreen_output.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/ncbitools/vecscreen/main.nf b/modules/nf-core/ncbitools/vecscreen/main.nf new file mode 100644 index 0000000..a277eca --- /dev/null +++ b/modules/nf-core/ncbitools/vecscreen/main.nf @@ -0,0 +1,46 @@ +process NCBITOOLS_VECSCREEN { + tag "$meta.id" + label 'process_single' + + container "docker.io/biocontainers/ncbi-tools-bin:6.1.20170106-6-deb_cv2" + + input: + tuple val(meta), path(fasta_file) + tuple val(adapters_database_meta), path(adapters_database_directory) + + output: + tuple val(meta), path("${meta.id}.vecscreen.out") , emit: vecscreen_output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "The VecScreen module does not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // WARN: VecScreen doesn't output a version number and doesn't appear to have a Github repository. 1.0 is arbitrarily used here as the version number + """ + DB=`find -L ${adapters_database_directory} -maxdepth 1 -name "*.nin" | sed 's/\\.nin\$//'` + vecscreen -d \$DB ${args} -i ${fasta_file} -o ${prefix}.vecscreen.out + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vecscreen: 1.0 + END_VERSIONS + """ + + stub: + // WARN: VecScreen doesn't output a version number and doesn't appear to have a Github repository. 1.0 is arbitrarily used here as the version number + """ + touch ${prefix}.vecscreen.out + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vecscreen: 1.0 + END_VERSIONS + """ +} diff --git a/modules/nf-core/ncbitools/vecscreen/meta.yml b/modules/nf-core/ncbitools/vecscreen/meta.yml new file mode 100644 index 0000000..45f0f7e --- /dev/null +++ b/modules/nf-core/ncbitools/vecscreen/meta.yml @@ -0,0 +1,50 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "NCBITOOLS_VECSCREEN" +description: NCBI tool for detecting vector contamination in nucleic acid sequences. This tool is older than NCBI's FCS-adaptor, which is for the same purpose +keywords: + - assembly + - genomics + - quality control + - contamination + - vector + - NCBI +tools: + - "ncbitools": + description: | + "NCBI libraries for biology applications (text-based utilities)" + homepage: "https://www.ncbi.nlm.nih.gov/tools/vecscreen/" + documentation: "https://www.ncbi.nlm.nih.gov/tools/vecscreen/interpretation/" + tool_dev_url: "https://www.ncbi.nlm.nih.gov/tools/vecscreen/" + licence: ["The Open Database License"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', taxid:'6973' ] + - fasta_file: + type: file + description: FASTA file that will be screened for contaminants + - adapters_database_file: + type: file + description: Path to a nucleotide BLAST database file with vector sequences + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', taxid:'9606' ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vecscreen_output: + type: file + description: VecScreen report file. This can be in different formats depending on the value of the optional -f parameter. 0 = HTML format, with alignments. 1 = HTML format, no alignments. 2 = Text list, with alignments. 3 = Text list, no alignments. default = 0 + pattern: "*.vecscreen.out" + +authors: + - "@eeaunin" diff --git a/subworkflows/local/run_vecscreen.nf b/subworkflows/local/run_vecscreen.nf new file mode 100644 index 0000000..645fe82 --- /dev/null +++ b/subworkflows/local/run_vecscreen.nf @@ -0,0 +1,34 @@ +#!/usr/bin/env nextflow +// MODULE IMPORT BLOCK + +include { CHUNK_ASSEMBLY_FOR_VECSCREEN } from '../../modules/local/chunk_assembly_for_vecscreen' +include { NCBITOOLS_VECSCREEN } from '../../modules/nf-core/ncbitools/vecscreen/main' +include { FILTER_VECSCREEN_RESULTS } from '../../modules/local/filter_vecscreen_results' +include { SUMMARISE_VECSCREEN_OUTPUT } from '../../modules/local/summarise_vecscreen_output' + + +workflow RUN_VECSCREEN { + take: + reference_tuple // val(meta), path(fasta) + adapters_blastn_database_directory + + main: + ch_versions = Channel.empty() + // MODULE: CHUNKS THE ASSEMBLY INTO PIECES WITH FIXED LENGTH + CHUNK_ASSEMBLY_FOR_VECSCREEN(reference_tuple) + ch_versions = ch_versions.mix(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.versions) + // MODULE: RUNS NCBI VECSCREEN + NCBITOOLS_VECSCREEN(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, [ [id:'vecscreen_adapters_database'], adapters_blastn_database_directory ]) + ch_versions = ch_versions.mix(NCBITOOLS_VECSCREEN.out.versions) + // MODULE: REFORMATS VECSCREEN OUTPUT AND FILTERS IT TO REMOVE NO-HIT RESULTS AND KEEP ONLY HITS + FILTER_VECSCREEN_RESULTS(NCBITOOLS_VECSCREEN.out.vecscreen_output) + ch_versions = ch_versions.mix(FILTER_VECSCREEN_RESULTS.out.versions) + // MODULE: CONVERTS COORDINATES IN ASSEMBLY CHUNKS BACK TO COORDINATES IN THE WHOLE ASSEMBLY AND WRITES A REPORT FILE + SUMMARISE_VECSCREEN_OUTPUT(FILTER_VECSCREEN_RESULTS.out.filtered_vecscreen_outfile) + ch_versions = ch_versions.mix(SUMMARISE_VECSCREEN_OUTPUT.out.versions) + + emit: + vecscreen_contamination = SUMMARISE_VECSCREEN_OUTPUT.out.vecscreen_contamination + versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] +} + From 568e9635a39aee148527e720d6a4f7bc7a1581f4 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Thu, 16 Nov 2023 17:09:53 +0000 Subject: [PATCH 02/23] Edited the main workflow and test yaml --- assets/test.yaml | 2 +- workflows/ascc.nf | 28 +++++ workflows/ascc_full.nf | 246 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 275 insertions(+), 1 deletion(-) create mode 100644 workflows/ascc_full.nf diff --git a/assets/test.yaml b/assets/test.yaml index 73d0697..d418a26 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -17,7 +17,7 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb -vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database/adaptors_for_screening_euks.fa +vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd seqkit: diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 220606f..8a98914 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -34,10 +34,12 @@ include { RUN_NT_KRAKEN } from '../subworkflows/local/run_nt_kra include { RUN_FCSGX } from '../subworkflows/local/run_fcsgx' include { PACBIO_BARCODE_CHECK } from '../subworkflows/local/pacbio_barcode_check' include { RUN_READ_COVERAGE } from '../subworkflows/local/run_read_coverage' +include { RUN_VECSCREEN } from '../subworkflows/local/run_vecscreen' include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' + // // MODULE: Local modules // @@ -95,10 +97,13 @@ workflow ASCC { // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA // + + /* EXTRACT_TIARA_HITS ( GENERATE_GENOME.out.reference_tuple ) ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) + */ // // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE @@ -129,54 +134,66 @@ workflow ASCC { // // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH // + + /* YAML_INPUT.out.mito_tuple .branch { meta, check -> valid: check != "NO MITO" invalid: check == "NO MITO" } .set { mito_check } + */ // // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME // + /* MITO_ORGANELLAR_BLAST ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.mito_var, mito_check.valid ) ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) + */ // // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH // + /* YAML_INPUT.out.plastid_tuple .branch { meta, check -> valid: check != "NO PLASTID" invalid: check == "NO PLASTID" } .set { plastid_check } + */ // // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME // + /* PLASTID_ORGANELLAR_BLAST ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.plastid_var, plastid_check.valid ) ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) + */ // // SUBWORKFLOW: // + /* RUN_FCSADAPTOR ( YAML_INPUT.out.reference_tuple ) ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + */ // // SUBWORKFLOW: // + /* RUN_FCSGX ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.fcs_gx_database_path, @@ -184,6 +201,7 @@ workflow ASCC { YAML_INPUT.out.ncbi_rankedlineage_path ) ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + */ // // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA @@ -199,6 +217,7 @@ workflow ASCC { // // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE // + /* RUN_READ_COVERAGE ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.assembly_path, @@ -206,10 +225,19 @@ workflow ASCC { YAML_INPUT.out.reads_type ) ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) + */ // // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS // + + RUN_VECSCREEN ( + GENERATE_GENOME.out.reference_tuple, + "/lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database" + //YAML_INPUT.out.vecscreen_database_path + ) + ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) + CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) diff --git a/workflows/ascc_full.nf b/workflows/ascc_full.nf new file mode 100644 index 0000000..3107122 --- /dev/null +++ b/workflows/ascc_full.nf @@ -0,0 +1,246 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + PRINT PARAMS SUMMARY +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' + +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) + +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation + +WorkflowAscc.initialise(params, log) + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules +// + +include { YAML_INPUT } from '../subworkflows/local/yaml_input' +include { GENERATE_GENOME } from '../subworkflows/local/generate_genome' +include { EXTRACT_TIARA_HITS } from '../subworkflows/local/extract_tiara_hits' +include { EXTRACT_NT_BLAST } from '../subworkflows/local/extract_nt_blast' +include { RUN_FCSADAPTOR } from '../subworkflows/local/run_fcsadaptor' +include { RUN_NT_KRAKEN } from '../subworkflows/local/run_nt_kraken' +include { RUN_FCSGX } from '../subworkflows/local/run_fcsgx' +include { PACBIO_BARCODE_CHECK } from '../subworkflows/local/pacbio_barcode_check' +include { RUN_READ_COVERAGE } from '../subworkflows/local/run_read_coverage' +include { RUN_VECSCREEN } from '../subworkflows/local/run_vecscreen' +include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' +include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' + + + +// +// MODULE: Local modules +// +include { GC_CONTENT } from '../modules/local/gc_content' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: Installed directly from nf-core/modules +// +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow ASCC { + + main: + ch_versions = Channel.empty() + + input_ch = Channel.fromPath(params.input, checkIfExists: true) + + // + // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE + // + YAML_INPUT ( + input_ch + ) + ch_versions = ch_versions.mix(YAML_INPUT.out.versions) + + // + // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA + // + GC_CONTENT ( + YAML_INPUT.out.reference_tuple + ) + ch_versions = ch_versions.mix(GC_CONTENT.out.versions) + + // + // SUBWORKFLOW: GENERATE GENOME FILE + // + GENERATE_GENOME ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.pacbio_barcodes + ) + ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) + + // + // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA + // + EXTRACT_TIARA_HITS ( + GENERATE_GENOME.out.reference_tuple + ) + ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) + + // + // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE + // + YAML_INPUT.out.reference_tuple + .combine ( YAML_INPUT.out.seqkit_sliding.toInteger() ) + .combine ( YAML_INPUT.out.seqkit_window.toInteger() ) + .map { meta, ref, sliding, window -> + tuple([ id : meta.id, + sliding : sliding, + window : window + ], + file(ref) + )} + .set { modified_input } + + // + // SUBWORKFLOW: EXTRACT RESULTS HITS FROM NT-BLAST + // +/* EXTRACT_NT_BLAST ( + modified_input, + YAML_INPUT.out.nt_database, + YAML_INPUT.out.ncbi_accessions, + YAML_INPUT.out.ncbi_rankedlineage_path + ) + ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) */ + + // + // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH + // + YAML_INPUT.out.mito_tuple + .branch { meta, check -> + valid: check != "NO MITO" + invalid: check == "NO MITO" + } + .set { mito_check } + + // + // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME + // + MITO_ORGANELLAR_BLAST ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.mito_var, + mito_check.valid + ) + ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) + + // + // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH + // + YAML_INPUT.out.plastid_tuple + .branch { meta, check -> + valid: check != "NO PLASTID" + invalid: check == "NO PLASTID" + } + .set { plastid_check } + + // + // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME + // + PLASTID_ORGANELLAR_BLAST ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.plastid_var, + plastid_check.valid + ) + ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) + + // + // SUBWORKFLOW: + // + RUN_FCSADAPTOR ( + YAML_INPUT.out.reference_tuple + ) + ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + + // + // SUBWORKFLOW: + // + RUN_FCSGX ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.fcs_gx_database_path, + YAML_INPUT.out.taxid, + YAML_INPUT.out.ncbi_rankedlineage_path + ) + ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + + // + // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA + // + /*PACBIO_BARCODE_CHECK ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.pacbio_tuple, + YAML_INPUT.out.pacbio_barcodes, + YAML_INPUT.out.pacbio_multiplex_codes + ) + ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions)*/ + + // + // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE + // + RUN_READ_COVERAGE ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.assembly_path, + YAML_INPUT.out.pacbio_tuple, + YAML_INPUT.out.reads_type + ) + ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) + + // + // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS + // + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) + + emit: + software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml + versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + COMPLETION EMAIL AND SUMMARY +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow.onComplete { + if (params.email || params.email_on_fail) { + NfcoreTemplate.email(workflow, params, summary_params, projectDir, log) + } + NfcoreTemplate.summary(workflow, params, log) + if (params.hook_url) { + NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) + } + // TreeValProject.summary(workflow, reference_tuple, summary_params, projectDir) + +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ From 87cffd9dcb684e79cfe656f1142c6f956dc497c8 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Thu, 16 Nov 2023 17:15:25 +0000 Subject: [PATCH 03/23] Edited the main workflow --- workflows/ascc.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 8a98914..1f596ed 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -80,10 +80,12 @@ workflow ASCC { // // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA // + /* GC_CONTENT ( YAML_INPUT.out.reference_tuple ) ch_versions = ch_versions.mix(GC_CONTENT.out.versions) + */ // // SUBWORKFLOW: GENERATE GENOME FILE From f045b698c223dce8b0af5893fa8b7d9977e9f320 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Thu, 16 Nov 2023 17:28:06 +0000 Subject: [PATCH 04/23] Edited the test yaml file --- assets/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/test.yaml b/assets/test.yaml index d418a26..5e1e398 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -1,4 +1,4 @@ -assembly_path: /lustre/scratch124/tol/projects/tol/data/insects/Polyommatus_atlantica/assembly/draft/treeval/ilPolAtla1_merged/raw/ref.fa +assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2.fasta assembly_title: asccTinyTest reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/ reads_type: "hifi" From dcfb977fe5f7c29826e31410d8f8de66c35b7ac3 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Thu, 16 Nov 2023 17:31:05 +0000 Subject: [PATCH 05/23] Edited the test yaml file --- assets/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/test.yaml b/assets/test.yaml index 5e1e398..cf55c9d 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -1,4 +1,4 @@ -assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2.fasta +assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2_fh.fasta assembly_title: asccTinyTest reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/ reads_type: "hifi" From 320e0716cd0521bf5f7d6b26a94d9e3036540c5b Mon Sep 17 00:00:00 2001 From: eeaunin Date: Thu, 16 Nov 2023 17:36:16 +0000 Subject: [PATCH 06/23] Edited the test yaml file --- assets/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/test.yaml b/assets/test.yaml index cf55c9d..2f3f9e1 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -1,4 +1,4 @@ -assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2_fh.fasta +assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2_fh2.fasta assembly_title: asccTinyTest reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/ reads_type: "hifi" From ddc2f300dfde6985bafc5fdca8c1653bdf4cc705 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Thu, 16 Nov 2023 18:00:31 +0000 Subject: [PATCH 07/23] Deleted trailing Ns scripts and reformatted the VecScreen Python scripts with black --- bin/VSlistTo1HitPerLine.py | 18 +++- bin/chunk_assembly_for_vecscreen.py | 52 +++++----- bin/summarise_vecscreen_output.py | 13 ++- bin/trim_Ns.py | 144 ---------------------------- modules/local/detect_trailing_n.nf | 42 -------- 5 files changed, 51 insertions(+), 218 deletions(-) delete mode 100755 bin/trim_Ns.py delete mode 100644 modules/local/detect_trailing_n.nf diff --git a/bin/VSlistTo1HitPerLine.py b/bin/VSlistTo1HitPerLine.py index 55b8ca7..87ea39d 100755 --- a/bin/VSlistTo1HitPerLine.py +++ b/bin/VSlistTo1HitPerLine.py @@ -16,6 +16,7 @@ import re import argparse + def main(args): hits_to_report = "" ID = "" @@ -27,14 +28,14 @@ def main(args): line = line.strip() Fld = line.split(" ") - if not re.match(r'^[0-9 \t]+$', line): + if not re.match(r"^[0-9 \t]+$", line): hits_to_report = "" if hits_to_report: print(f"VecScreen_{hits_to_report.ljust(8)}\t{ID}\t{line}") hits += 1 continue - + if line.startswith(">Vector "): if ID != "": if error_found: @@ -75,13 +76,20 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Reformatting VecScreen's output") - parser.add_argument("vecscreen_output_file", type=str, help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)", default=None) + parser.add_argument( + "vecscreen_output_file", + type=str, + help="Path to a raw output file from NCBI VecScreen (from a run with the -f3 flag)", + default=None, + ) parser.add_argument("--skip_reporting_strong_hits", action="store_true", help="Skip reporting strong hits") parser.add_argument("--skip_reporting_moderate_hits", action="store_true", help="Skip reporting moderate hits") parser.add_argument("--skip_reporting_weak_hits", action="store_true", help="Skip reporting weak hits") - parser.add_argument("--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin") + parser.add_argument( + "--skip_reporting_suspect_hits", action="store_true", help="Skip reporting hits of suspect origin" + ) parser.add_argument("--skip_reporting_no_hits", action="store_true", help="Skip reporting no-hits") parser.add_argument("--skip_reporting_errors", action="store_true", help="Skip reporting errors") parser.add_argument("-v", "--version", action="version", version="1.0") args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py index 5721745..321777c 100755 --- a/bin/chunk_assembly_for_vecscreen.py +++ b/bin/chunk_assembly_for_vecscreen.py @@ -7,42 +7,44 @@ import argparse import os -def main(fasta_input_file, fasta_output_file): - fasta_input_file = os.path.abspath(fasta_input_file) - fasta_output_file = os.path.abspath(fasta_output_file) - threshold_length = 500000 - overlap_length = int( threshold_length / 10 ) - minimum_record_size = 11 +def main(fasta_input_file, fasta_output_file): + fasta_input_file = os.path.abspath(fasta_input_file) + fasta_output_file = os.path.abspath(fasta_output_file) - fasta_output_handle = open(fasta_output_file, 'w') + threshold_length = 500000 + overlap_length = int(threshold_length / 10) + minimum_record_size = 11 - with open(fasta_input_file, 'r') as fasta_input_handle: - for record in SeqIO.parse(fasta_input_handle, "fasta"): + fasta_output_handle = open(fasta_output_file, "w") - if len(record) >= minimum_record_size: - records_to_write = [] + with open(fasta_input_file, "r") as fasta_input_handle: + for record in SeqIO.parse(fasta_input_handle, "fasta"): + if len(record) >= minimum_record_size: + records_to_write = [] - slice_count = 0 - while (slice_count * threshold_length) < len(record) - (threshold_length+overlap_length): - record_slice = record[(slice_count*threshold_length):((slice_count+1)*threshold_length + overlap_length)] - record_slice.id += '.chunk_' + str(slice_count+1) + slice_count = 0 + while (slice_count * threshold_length) < len(record) - (threshold_length + overlap_length): + record_slice = record[ + (slice_count * threshold_length) : ((slice_count + 1) * threshold_length + overlap_length) + ] + record_slice.id += ".chunk_" + str(slice_count + 1) - record_slice.description = '' - records_to_write.append(record_slice) - slice_count += 1 + record_slice.description = "" + records_to_write.append(record_slice) + slice_count += 1 - final_record_slice = record[(slice_count*threshold_length):] + final_record_slice = record[(slice_count * threshold_length) :] - if slice_count > 0: - final_record_slice.id += '.chunk_' + str(slice_count+1) - final_record_slice.description = '' + if slice_count > 0: + final_record_slice.id += ".chunk_" + str(slice_count + 1) + final_record_slice.description = "" - records_to_write.append(final_record_slice) + records_to_write.append(final_record_slice) - SeqIO.write(records_to_write, fasta_output_handle, 'fasta') + SeqIO.write(records_to_write, fasta_output_handle, "fasta") - fasta_output_handle.close() + fasta_output_handle.close() if __name__ == "__main__": diff --git a/bin/summarise_vecscreen_output.py b/bin/summarise_vecscreen_output.py index a513f53..50cb173 100755 --- a/bin/summarise_vecscreen_output.py +++ b/bin/summarise_vecscreen_output.py @@ -46,8 +46,17 @@ def main(vecscreen_file, chunk_size): if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("vecscreen_file", type=str, help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk") - parser.add_argument("--chunk_size", type=int, help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000", default=50000) + parser.add_argument( + "vecscreen_file", + type=str, + help="Path to output file of VecScreen (run with -f3 flag), filtered with VSlistTo1HitPerLine.awk", + ) + parser.add_argument( + "--chunk_size", + type=int, + help="Chunk size of the chunked FASTA file that VecScreen was run with, in bp. Default: 500000", + default=50000, + ) parser.add_argument("-v", "--version", action="version", version="1.0") args = parser.parse_args() main(args.vecscreen_file, args.chunk_size) diff --git a/bin/trim_Ns.py b/bin/trim_Ns.py deleted file mode 100755 index 3f20eb6..0000000 --- a/bin/trim_Ns.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 -""" -Script for detecting trailing Ns that should be trimmed from an assembly, from James Torrance. Edited by Eerik Aunin -""" - -import re -from Bio import SeqIO -import argparse - -def main(fasta_file, output_file): - - minleftover = 200 # after trimming start/end, at least this many bp should be left - winsize = 5000 # for sliding window analysis - minslidingBase = 0.4 # maximum fraction of Ns in sliding window before alarm sets off - - output_handle = open(output_file, 'w') - with open(fasta_file, "r") as fasta_input_handle: - for record in SeqIO.parse(fasta_input_handle, "fasta"): - # output_handle.write('Testing ' + record.id + "\n") - - # Do this twice: once with the regular string, once with the reversed string - # The output should be as one or more variables that can be reversed for the second iteration - seq_string = str(record.seq) - - n_count = seq_string.count('N') + seq_string.count('n') - n_perc = n_count/ len(seq_string) - if n_perc > 0.8: - output_handle.write('# WARNING: ' + record.id + '\t' + str(int(n_perc * 10000)/100) + ' % Ns of total ' + str(len(seq_string)) + "\n") - - # Consider handling this via two separate regexps for start and end. Adding character-class might be escalating run-time - #terminal_n_match = re.match('^([Nn]*)(\S+?)([Nn]*)$', seq_string) - start_n_match = re.match('^([Nn]*)', seq_string) - end_n_match = re.search('^([Nn]*)', seq_string[::-1]) - - startseq = '' - if start_n_match: - startseq = start_n_match.group(1) - endseq = '' - if end_n_match: - endseq = end_n_match.group(1) - realseq_length = len(seq_string) - len(startseq) - len(endseq) - # Handle "all Ns exception" - if len(startseq) == len(seq_string): - realseq_length = 0 - endseq = '' - - if len(startseq) > 0 or len(endseq) > 0: - if len(startseq) > 0: - output_handle.write('\t'.join(["TRIM:",record.id,'1',str(len(startseq))]) + "\t\n") - if len(endseq) > 0: - output_handle.write('\t'.join(["TRIM:",record.id,str(len(startseq)+realseq_length+1),str(len(seq_string))]) + "\t\n") - - if realseq_length <= minleftover: - output_handle.write("REMOVE: " + record.id + '\t' + str(realseq_length) + " bp leftover after trimming\n") - - - # Only attempt the windowing approach if there's an initial window that doesn't meet the trigger condition - for seq_string_loop, seq_string_for_window in enumerate([seq_string, seq_string[::-1]]): - if len(seq_string_for_window) > winsize and (seq_string_for_window[:winsize].count('N') + seq_string_for_window[:winsize].count('n')) > winsize * minslidingBase: - - non_n_regions = [] - non_n_iterator = re.finditer('[^Nn]+', seq_string_for_window) - for non_n_instance in non_n_iterator: - # output_handle.write(record.id + '\t' + str(non_n_instance.start(0)) + '\t' + str(non_n_instance.end(0)) + '\n') - - current_non_n_start = non_n_instance.start(0) + 1 - current_non_n_end = non_n_instance.end(0) - - non_n_regions.insert(0, [current_non_n_start, current_non_n_end]) - - # Does the *end* of this block satisfy the window condition? - bases_in_window = 0 - start_of_first_base_block_in_window = None - for non_n_region in non_n_regions: - if non_n_region[1] >= current_non_n_end - winsize: - start_of_first_base_block_in_window = non_n_region[0] - if non_n_region[0] >= current_non_n_end - winsize: - bases_in_window += non_n_region[1] - non_n_region[0] + 1 - else: - bases_in_window += non_n_region[1] - non_n_region[0] + 1 - break - else: - break - - if bases_in_window >= minslidingBase * winsize: - # Because start positions have BED-style - - # Remember that the blocks are in *reverse* order along the sequence - trackback_to_start_flag = False - - for i, non_n_region in enumerate(non_n_regions): - if i == 0: - continue - bases_in_window_2 = 0 - if non_n_region[1] < non_n_regions[0][0] - winsize: - break - else: - current_window_start = max(non_n_region[0], non_n_regions[0][0] - winsize) - - # Count the bases from this block - bases_in_window_2 += min(non_n_region[1], (current_window_start+winsize-1) ) - current_window_start + 1 - - # Add up all the sequence in blocks tested thus far, but not the final block - for j in range(1, i): - bases_in_window_2 += non_n_regions[j][1] - non_n_regions[j][0] + 1 - - # Count the sequence in the final block that can contribute - # This is the region between the start of the final block and either the end of the block - # or the end of a window extending from the current test start point, whichever comes first - bases_in_window_2 += min( non_n_regions[0][1], (current_window_start + winsize-1) ) - non_n_regions[0][0] + 1 - - # output_handle.write('BIW: ' + str(i) + ' ' + str(bases_in_window_2) + "\n") - if bases_in_window_2 >= minslidingBase * winsize: - if current_window_start == non_n_region[0]: - start_of_first_base_block_in_window = current_window_start - else: - start_of_first_base_block_in_window = non_n_regions[i-1][0] - else: - # We keep going back until the threshold condition is *not* met - break - - # If we find the break-point should be before the first block, then we don't want to trim at all! - if i == len(non_n_regions) - 1: - trackback_to_start_flag = True - - - # Only trim if the breakpoint isn't before the first block - # and if the breakpoint isn't at the start of the sequence - if not(trackback_to_start_flag) and start_of_first_base_block_in_window != 1: - if seq_string_loop == 0: - output_handle.write("FWDCLIP:\t" + record.id + "\t1\t" + str(start_of_first_base_block_in_window - 1) + "\n") - else: - output_handle.write("REVCLIP:\t" + record.id + '\t' + str(len(seq_string) - start_of_first_base_block_in_window + 2) + '\t' + str(len(seq_string)) + "\n") - - break - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("fasta_file", type=str, help="Path to input FASTA file") - parser.add_argument("output_file", type=str, help="Path for output report file") - parser.add_argument("-v", "--version", action="version", version="1.0") - args = parser.parse_args() - main(args.fasta_file, args.output_file) \ No newline at end of file diff --git a/modules/local/detect_trailing_n.nf b/modules/local/detect_trailing_n.nf deleted file mode 100644 index 0ddf966..0000000 --- a/modules/local/detect_trailing_n.nf +++ /dev/null @@ -1,42 +0,0 @@ -process DETECT_TRAILING_N { - tag "$meta.id" - label 'process_single' - - conda "bioconda::biopython=1.70" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py35_1': - 'quay.io/biocontainers/biopython:1.70--np112py35_1' }" - - input: - tuple val(meta), path(fasta_input_file) - - output: - tuple val(meta), path("*_trim_Ns"), emit: trailing_n_report - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - trim_Ns.py ${fasta_input_file} ${prefix}_trim_Ns - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}_trim_Ns - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2) - END_VERSIONS - """ -} \ No newline at end of file From f39e4c52e02bace81b97a2bcf19439c46572d9b8 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Thu, 16 Nov 2023 20:57:49 +0000 Subject: [PATCH 08/23] Trying a change in github_testing/test.yaml --- assets/github_testing/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index bb1c09f..57e9d5c 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -20,7 +20,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/ diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd -vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna +vecscreen_database_path: /home/runner/work/ascc/ascc/kraken2/kraken2 seqkit: sliding: 6000 window: 100000 From d5d7838862b661db6e2b8e6fbbeabc9d5490e194 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Thu, 16 Nov 2023 21:13:56 +0000 Subject: [PATCH 09/23] Trying a change in github_testing/test.yaml --- assets/github_testing/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index 57e9d5c..e4e7b82 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -20,7 +20,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/ diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd -vecscreen_database_path: /home/runner/work/ascc/ascc/kraken2/kraken2 +vecscreen_database_path: /home/runner/work/ascc/ascc/NT_database seqkit: sliding: 6000 window: 100000 From 234803aa7093a73db4eb2a0be4aa401c6650ca90 Mon Sep 17 00:00:00 2001 From: yumisims Date: Fri, 17 Nov 2023 14:27:29 +0000 Subject: [PATCH 10/23] add vecscreen db --- .github/workflows/ci.yml | 2 +- assets/github_testing/test.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 83d2195..b7c0b09 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -82,7 +82,7 @@ jobs: - name: Download the vecscreen test data run: | mkdir vecscreen - wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna -O vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna + curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/v4/16SMicrobial_v4.tar.gz | tar -C vecscreen -xzf - - name: Run pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index e4e7b82..7b35de1 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -20,7 +20,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/ diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd -vecscreen_database_path: /home/runner/work/ascc/ascc/NT_database +vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen seqkit: sliding: 6000 window: 100000 From 1f628989e76150b3a7774add30ca64cfc0e7ef08 Mon Sep 17 00:00:00 2001 From: yumisims Date: Fri, 17 Nov 2023 15:07:15 +0000 Subject: [PATCH 11/23] putting in blastdb tuple generation --- subworkflows/local/run_vecscreen.nf | 20 ++++++++++++++++++-- workflows/ascc.nf | 3 +-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/run_vecscreen.nf b/subworkflows/local/run_vecscreen.nf index 645fe82..2bcf027 100644 --- a/subworkflows/local/run_vecscreen.nf +++ b/subworkflows/local/run_vecscreen.nf @@ -10,19 +10,35 @@ include { SUMMARISE_VECSCREEN_OUTPUT } from '../../modules/local/summarise_ workflow RUN_VECSCREEN { take: reference_tuple // val(meta), path(fasta) - adapters_blastn_database_directory + vecscreen_database_path main: ch_versions = Channel.empty() // MODULE: CHUNKS THE ASSEMBLY INTO PIECES WITH FIXED LENGTH CHUNK_ASSEMBLY_FOR_VECSCREEN(reference_tuple) ch_versions = ch_versions.mix(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.versions) + + reference_tuple + .combine( vecscreen_database_path ) + .map { meta, ref, vecscreen_database_path -> + tuple( + [ id : meta.id + ], + vecscreen_database_path + ) + } + .set { vecscreen_database } + + + // MODULE: RUNS NCBI VECSCREEN - NCBITOOLS_VECSCREEN(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, [ [id:'vecscreen_adapters_database'], adapters_blastn_database_directory ]) + NCBITOOLS_VECSCREEN(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, vecscreen_database) ch_versions = ch_versions.mix(NCBITOOLS_VECSCREEN.out.versions) + // MODULE: REFORMATS VECSCREEN OUTPUT AND FILTERS IT TO REMOVE NO-HIT RESULTS AND KEEP ONLY HITS FILTER_VECSCREEN_RESULTS(NCBITOOLS_VECSCREEN.out.vecscreen_output) ch_versions = ch_versions.mix(FILTER_VECSCREEN_RESULTS.out.versions) + // MODULE: CONVERTS COORDINATES IN ASSEMBLY CHUNKS BACK TO COORDINATES IN THE WHOLE ASSEMBLY AND WRITES A REPORT FILE SUMMARISE_VECSCREEN_OUTPUT(FILTER_VECSCREEN_RESULTS.out.filtered_vecscreen_outfile) ch_versions = ch_versions.mix(SUMMARISE_VECSCREEN_OUTPUT.out.versions) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 1f596ed..aca9975 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -235,8 +235,7 @@ workflow ASCC { RUN_VECSCREEN ( GENERATE_GENOME.out.reference_tuple, - "/lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database" - //YAML_INPUT.out.vecscreen_database_path + YAML_INPUT.out.vecscreen_database_path ) ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) From a1cd1ffbb02e5d9a41c691dd32fedb0490008cc8 Mon Sep 17 00:00:00 2001 From: yumisims Date: Fri, 17 Nov 2023 15:18:26 +0000 Subject: [PATCH 12/23] refactor chunk fasta --- bin/chunk_assembly_for_vecscreen.py | 64 ++++++++++++++++------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py index 321777c..576064e 100755 --- a/bin/chunk_assembly_for_vecscreen.py +++ b/bin/chunk_assembly_for_vecscreen.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 """ -Script for chunking an assembly before running NCBI VecScreen. Adapted from a script by James Torrance, edited by Eerik Aunin +Script for chunking an assembly before running NCBI VecScreen. Adapted from a script by James Torrance. +The script was further refactored by Eerik Aunin and Yumi Sims """ from Bio import SeqIO @@ -8,6 +9,30 @@ import os +def process_record(record, chunk_num, threshold_length, overlap_length): + return ( + record[ + chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length) + ] + if chunk_num * threshold_length < len(record) - (threshold_length + overlap_length) + else record[chunk_num * threshold_length :] + ) + + +def generate_records_to_write(record, threshold_length, overlap_length): + return [ + ( + record_slice.id + f".chunk_{chunk_num + 1}", + "", + record_slice, + ) + for chunk_num, record_slice in enumerate( + process_record(record, i, threshold_length, overlap_length) + for i in range((len(record) - 1) // threshold_length + 1) + ) + ] + + def main(fasta_input_file, fasta_output_file): fasta_input_file = os.path.abspath(fasta_input_file) fasta_output_file = os.path.abspath(fasta_output_file) @@ -16,35 +41,15 @@ def main(fasta_input_file, fasta_output_file): overlap_length = int(threshold_length / 10) minimum_record_size = 11 - fasta_output_handle = open(fasta_output_file, "w") - - with open(fasta_input_file, "r") as fasta_input_handle: - for record in SeqIO.parse(fasta_input_handle, "fasta"): - if len(record) >= minimum_record_size: - records_to_write = [] - - slice_count = 0 - while (slice_count * threshold_length) < len(record) - (threshold_length + overlap_length): - record_slice = record[ - (slice_count * threshold_length) : ((slice_count + 1) * threshold_length + overlap_length) - ] - record_slice.id += ".chunk_" + str(slice_count + 1) + try: + with open(fasta_output_file, "w") as fasta_output_handle: + for record in SeqIO.parse(fasta_input_file, "fasta"): + if len(record) >= minimum_record_size: + records_to_write = generate_records_to_write(record, threshold_length, overlap_length) + SeqIO.write(records_to_write, fasta_output_handle, "fasta") - record_slice.description = "" - records_to_write.append(record_slice) - slice_count += 1 - - final_record_slice = record[(slice_count * threshold_length) :] - - if slice_count > 0: - final_record_slice.id += ".chunk_" + str(slice_count + 1) - final_record_slice.description = "" - - records_to_write.append(final_record_slice) - - SeqIO.write(records_to_write, fasta_output_handle, "fasta") - - fasta_output_handle.close() + except Exception as e: + print(f"An error occurred: {e}") if __name__ == "__main__": @@ -54,3 +59,4 @@ def main(fasta_input_file, fasta_output_file): parser.add_argument("-v", "--version", action="version", version="1.0") args = parser.parse_args() main(args.fasta_input_file, args.fasta_output_file) + From 6cc19bc9f79712b85f2a2e0cd64e488174d5662e Mon Sep 17 00:00:00 2001 From: yumisims Date: Fri, 17 Nov 2023 15:21:02 +0000 Subject: [PATCH 13/23] black --- bin/chunk_assembly_for_vecscreen.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py index 576064e..7f08e54 100755 --- a/bin/chunk_assembly_for_vecscreen.py +++ b/bin/chunk_assembly_for_vecscreen.py @@ -11,9 +11,7 @@ def process_record(record, chunk_num, threshold_length, overlap_length): return ( - record[ - chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length) - ] + record[chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length)] if chunk_num * threshold_length < len(record) - (threshold_length + overlap_length) else record[chunk_num * threshold_length :] ) @@ -59,4 +57,3 @@ def main(fasta_input_file, fasta_output_file): parser.add_argument("-v", "--version", action="version", version="1.0") args = parser.parse_args() main(args.fasta_input_file, args.fasta_output_file) - From dea8d118fdd808d369a8339346c8fc2dcd81bcff Mon Sep 17 00:00:00 2001 From: yumisims Date: Fri, 17 Nov 2023 15:25:58 +0000 Subject: [PATCH 14/23] black --- bin/chunk_assembly_for_vecscreen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py index 7f08e54..38bce8d 100755 --- a/bin/chunk_assembly_for_vecscreen.py +++ b/bin/chunk_assembly_for_vecscreen.py @@ -20,7 +20,7 @@ def process_record(record, chunk_num, threshold_length, overlap_length): def generate_records_to_write(record, threshold_length, overlap_length): return [ ( - record_slice.id + f".chunk_{chunk_num + 1}", + record_slice.id + ".chunk_{}".format(chunk_num + 1), "", record_slice, ) From 8bcae07fcdfa5325d1134d1dfed3f1d78aeaf0fa Mon Sep 17 00:00:00 2001 From: yumisims Date: Fri, 17 Nov 2023 15:35:22 +0000 Subject: [PATCH 15/23] the default python is 3.5 so change the beloved f striiiiing --- bin/chunk_assembly_for_vecscreen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py index 38bce8d..e36d1ed 100755 --- a/bin/chunk_assembly_for_vecscreen.py +++ b/bin/chunk_assembly_for_vecscreen.py @@ -47,7 +47,7 @@ def main(fasta_input_file, fasta_output_file): SeqIO.write(records_to_write, fasta_output_handle, "fasta") except Exception as e: - print(f"An error occurred: {e}") + print("An error occurred: {}".format(e)) if __name__ == "__main__": From 77a98a7433ce8c32e465fa737d7b8c6e5cb4e70a Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 17 Nov 2023 16:33:00 +0000 Subject: [PATCH 16/23] Formatting and minor changes to channels --- conf/modules.config | 3 + modules/local/chunk_assembly_for_vecscreen.nf | 10 ++-- modules/local/filter_vecscreen_results.nf | 7 ++- modules/local/summarise_vecscreen_output.nf | 2 + subworkflows/local/run_vecscreen.nf | 59 +++++++++---------- subworkflows/local/yaml_input.nf | 11 +++- workflows/ascc.nf | 51 ++++++---------- 7 files changed, 73 insertions(+), 70 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 2febc01..0820f4a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -99,4 +99,7 @@ process { ext.args = { "-f3" } } + withName: FILTER_VECSCREEN_RESULTS { + ext.args = "--skip_reporting_suspect_hits --skip_reporting_weak_hits --skip_reporting_no_hits" + } } diff --git a/modules/local/chunk_assembly_for_vecscreen.nf b/modules/local/chunk_assembly_for_vecscreen.nf index 727e04b..9969e42 100644 --- a/modules/local/chunk_assembly_for_vecscreen.nf +++ b/modules/local/chunk_assembly_for_vecscreen.nf @@ -11,15 +11,16 @@ process CHUNK_ASSEMBLY_FOR_VECSCREEN { tuple val(meta), path(fasta_input_file) output: - tuple val(meta), path("chunked_assembly.fa"), emit: chunked_assembly - path "versions.yml" , emit: versions + tuple val(meta), path("*_chunked_assembly.fa") , emit: chunked_assembly + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: + def prefix = args.ext.prefix ?: "${meta.id}" """ - chunk_assembly_for_vecscreen.py $fasta_input_file chunked_assembly.fa + chunk_assembly_for_vecscreen.py $fasta_input_file ${prefix}_chunked_assembly.fa cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -29,8 +30,9 @@ process CHUNK_ASSEMBLY_FOR_VECSCREEN { """ stub: + def prefix = args.ext.prefix ?: "${meta.id}" """ - touch chunked_assembly.fa + touch ${prefix}_chunked_assembly.fa cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/filter_vecscreen_results.nf b/modules/local/filter_vecscreen_results.nf index 38c701e..9cb1b3b 100644 --- a/modules/local/filter_vecscreen_results.nf +++ b/modules/local/filter_vecscreen_results.nf @@ -18,8 +18,10 @@ process FILTER_VECSCREEN_RESULTS { task.ext.when == null || task.ext.when script: + def prefix = args.ext.prefix ?: "${meta.id}" + def args = args.ext.args ?: '' """ - VSlistTo1HitPerLine.py --skip_reporting_suspect_hits --skip_reporting_weak_hits --skip_reporting_no_hits ${vecscreen_outfile} > vecscreen.grepped.out + VSlistTo1HitPerLine.py ${args} ${vecscreen_outfile} > ${prefix}_vecscreen.grepped.out cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -29,8 +31,9 @@ process FILTER_VECSCREEN_RESULTS { """ stub: + def prefix = args.ext.prefix ?: "${meta.id}" """ - touch vecscreen.grepped.out + touch ${prefix}_vecscreen.grepped.out cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/summarise_vecscreen_output.nf b/modules/local/summarise_vecscreen_output.nf index 3154c3e..8268f3c 100644 --- a/modules/local/summarise_vecscreen_output.nf +++ b/modules/local/summarise_vecscreen_output.nf @@ -21,6 +21,7 @@ process SUMMARISE_VECSCREEN_OUTPUT { def prefix = task.ext.prefix ?: "${meta.id}" """ summarise_vecscreen_output.py ${vecscreen_filtered_outfile} > ${prefix}.vecscreen_contamination + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') @@ -32,6 +33,7 @@ process SUMMARISE_VECSCREEN_OUTPUT { def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.vecscreen_contamination + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') diff --git a/subworkflows/local/run_vecscreen.nf b/subworkflows/local/run_vecscreen.nf index 2bcf027..27a38b1 100644 --- a/subworkflows/local/run_vecscreen.nf +++ b/subworkflows/local/run_vecscreen.nf @@ -1,50 +1,49 @@ #!/usr/bin/env nextflow // MODULE IMPORT BLOCK -include { CHUNK_ASSEMBLY_FOR_VECSCREEN } from '../../modules/local/chunk_assembly_for_vecscreen' -include { NCBITOOLS_VECSCREEN } from '../../modules/nf-core/ncbitools/vecscreen/main' -include { FILTER_VECSCREEN_RESULTS } from '../../modules/local/filter_vecscreen_results' -include { SUMMARISE_VECSCREEN_OUTPUT } from '../../modules/local/summarise_vecscreen_output' +include { CHUNK_ASSEMBLY_FOR_VECSCREEN } from '../../modules/local/chunk_assembly_for_vecscreen' +include { NCBITOOLS_VECSCREEN } from '../../modules/nf-core/ncbitools/vecscreen/main' +include { FILTER_VECSCREEN_RESULTS } from '../../modules/local/filter_vecscreen_results' +include { SUMMARISE_VECSCREEN_OUTPUT } from '../../modules/local/summarise_vecscreen_output' workflow RUN_VECSCREEN { take: - reference_tuple // val(meta), path(fasta) - vecscreen_database_path + reference_tuple // val(meta), path(fasta) + vecscreen_database_tuple // val(meta), val(db_path) main: - ch_versions = Channel.empty() - // MODULE: CHUNKS THE ASSEMBLY INTO PIECES WITH FIXED LENGTH - CHUNK_ASSEMBLY_FOR_VECSCREEN(reference_tuple) - ch_versions = ch_versions.mix(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.versions) - - reference_tuple - .combine( vecscreen_database_path ) - .map { meta, ref, vecscreen_database_path -> - tuple( - [ id : meta.id - ], - vecscreen_database_path - ) - } - .set { vecscreen_database } - + ch_versions = Channel.empty() + // + // MODULE: CHUNKS THE ASSEMBLY INTO PIECES WITH FIXED LENGTH + // + CHUNK_ASSEMBLY_FOR_VECSCREEN( reference_tuple ) + ch_versions = ch_versions.mix( CHUNK_ASSEMBLY_FOR_VECSCREEN.out.versions ) + // // MODULE: RUNS NCBI VECSCREEN - NCBITOOLS_VECSCREEN(CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, vecscreen_database) - ch_versions = ch_versions.mix(NCBITOOLS_VECSCREEN.out.versions) - + // + NCBITOOLS_VECSCREEN( + CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, + vecscreen_database_tuple + ) + ch_versions = ch_versions.mix( NCBITOOLS_VECSCREEN.out.versions ) + + // // MODULE: REFORMATS VECSCREEN OUTPUT AND FILTERS IT TO REMOVE NO-HIT RESULTS AND KEEP ONLY HITS - FILTER_VECSCREEN_RESULTS(NCBITOOLS_VECSCREEN.out.vecscreen_output) - ch_versions = ch_versions.mix(FILTER_VECSCREEN_RESULTS.out.versions) + // + FILTER_VECSCREEN_RESULTS( NCBITOOLS_VECSCREEN.out.vecscreen_output ) + ch_versions = ch_versions.mix( FILTER_VECSCREEN_RESULTS.out.versions ) + // // MODULE: CONVERTS COORDINATES IN ASSEMBLY CHUNKS BACK TO COORDINATES IN THE WHOLE ASSEMBLY AND WRITES A REPORT FILE - SUMMARISE_VECSCREEN_OUTPUT(FILTER_VECSCREEN_RESULTS.out.filtered_vecscreen_outfile) - ch_versions = ch_versions.mix(SUMMARISE_VECSCREEN_OUTPUT.out.versions) + // + SUMMARISE_VECSCREEN_OUTPUT( FILTER_VECSCREEN_RESULTS.out.filtered_vecscreen_outfile ) + ch_versions = ch_versions.mix( SUMMARISE_VECSCREEN_OUTPUT.out.versions ) emit: vecscreen_contamination = SUMMARISE_VECSCREEN_OUTPUT.out.vecscreen_contamination - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions.ifEmpty( null ) // channel: [ versions.yml ] } diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 3ac41ec..964f29d 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -97,6 +97,15 @@ workflow YAML_INPUT { } .set{ ch_plastid } + group.assembly_title + .combine ( group.vecscreen_database_path ) + .map{ assembly_id, db_path -> + tuple( [ id: assembly_id ], + db_path + ) + } + .set{ ch_vecscreen } + emit: reference_tuple = ch_reference pacbio_tuple = ch_pacbio @@ -115,7 +124,7 @@ workflow YAML_INPUT { fcs_gx_database_path = group.fcs_gx_database_path diamond_uniprot_database_path = group.diamond_uniprot_database_path diamond_nr_database_path = group.diamond_nr_database_path - vecscreen_database_path = group.vecscreen_database_path + vecscreen_database_path = ch_vecscreen seqkit_sliding = seqkit.sliding_value seqkit_window = seqkit.window_value mito_tuple = ch_mito diff --git a/workflows/ascc.nf b/workflows/ascc.nf index aca9975..c61f2b7 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -24,22 +24,19 @@ WorkflowAscc.initialise(params, log) // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // - -include { YAML_INPUT } from '../subworkflows/local/yaml_input' -include { GENERATE_GENOME } from '../subworkflows/local/generate_genome' -include { EXTRACT_TIARA_HITS } from '../subworkflows/local/extract_tiara_hits' -include { EXTRACT_NT_BLAST } from '../subworkflows/local/extract_nt_blast' -include { RUN_FCSADAPTOR } from '../subworkflows/local/run_fcsadaptor' -include { RUN_NT_KRAKEN } from '../subworkflows/local/run_nt_kraken' -include { RUN_FCSGX } from '../subworkflows/local/run_fcsgx' -include { PACBIO_BARCODE_CHECK } from '../subworkflows/local/pacbio_barcode_check' -include { RUN_READ_COVERAGE } from '../subworkflows/local/run_read_coverage' -include { RUN_VECSCREEN } from '../subworkflows/local/run_vecscreen' +include { YAML_INPUT } from '../subworkflows/local/yaml_input' +include { GENERATE_GENOME } from '../subworkflows/local/generate_genome' +include { EXTRACT_TIARA_HITS } from '../subworkflows/local/extract_tiara_hits' +include { EXTRACT_NT_BLAST } from '../subworkflows/local/extract_nt_blast' +include { RUN_FCSADAPTOR } from '../subworkflows/local/run_fcsadaptor' +include { RUN_NT_KRAKEN } from '../subworkflows/local/run_nt_kraken' +include { RUN_FCSGX } from '../subworkflows/local/run_fcsgx' +include { PACBIO_BARCODE_CHECK } from '../subworkflows/local/pacbio_barcode_check' +include { RUN_READ_COVERAGE } from '../subworkflows/local/run_read_coverage' +include { RUN_VECSCREEN } from '../subworkflows/local/run_vecscreen' include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' - - // // MODULE: Local modules // @@ -125,77 +122,66 @@ workflow ASCC { // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM NT-BLAST // -/* EXTRACT_NT_BLAST ( + EXTRACT_NT_BLAST ( modified_input, YAML_INPUT.out.nt_database, YAML_INPUT.out.ncbi_accessions, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) */ + ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) // // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH // - - /* YAML_INPUT.out.mito_tuple .branch { meta, check -> valid: check != "NO MITO" invalid: check == "NO MITO" } .set { mito_check } - */ + // // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME // - /* MITO_ORGANELLAR_BLAST ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.mito_var, mito_check.valid ) ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) - */ // // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH // - /* YAML_INPUT.out.plastid_tuple .branch { meta, check -> valid: check != "NO PLASTID" invalid: check == "NO PLASTID" } .set { plastid_check } - */ // // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME // - /* PLASTID_ORGANELLAR_BLAST ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.plastid_var, plastid_check.valid ) ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) - */ // // SUBWORKFLOW: // - /* RUN_FCSADAPTOR ( YAML_INPUT.out.reference_tuple ) ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) - */ // // SUBWORKFLOW: // - /* RUN_FCSGX ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.fcs_gx_database_path, @@ -203,23 +189,21 @@ workflow ASCC { YAML_INPUT.out.ncbi_rankedlineage_path ) ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) - */ // // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA // - /*PACBIO_BARCODE_CHECK ( + PACBIO_BARCODE_CHECK ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.pacbio_tuple, YAML_INPUT.out.pacbio_barcodes, YAML_INPUT.out.pacbio_multiplex_codes ) - ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions)*/ + ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions) // // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE // - /* RUN_READ_COVERAGE ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.assembly_path, @@ -227,18 +211,19 @@ workflow ASCC { YAML_INPUT.out.reads_type ) ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) - */ // // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS // - RUN_VECSCREEN ( GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.vecscreen_database_path ) ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) + // + // SUBWORKFLOW: Collates version data from prior subworflows + // CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) From 7317fd7c9854db1a9e4b12c023b29a703e2ff55c Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 17 Nov 2023 16:38:04 +0000 Subject: [PATCH 17/23] removed remnant file --- workflows/ascc_full.nf | 246 ----------------------------------------- 1 file changed, 246 deletions(-) delete mode 100644 workflows/ascc_full.nf diff --git a/workflows/ascc_full.nf b/workflows/ascc_full.nf deleted file mode 100644 index 3107122..0000000 --- a/workflows/ascc_full.nf +++ /dev/null @@ -1,246 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - PRINT PARAMS SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' - -def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) -def citation = '\n' + WorkflowMain.citation(workflow) + '\n' -def summary_params = paramsSummaryMap(workflow) - -// Print parameter summary log to screen -log.info logo + paramsSummaryLog(workflow) + citation - -WorkflowAscc.initialise(params, log) - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT LOCAL MODULES/SUBWORKFLOWS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// -// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules -// - -include { YAML_INPUT } from '../subworkflows/local/yaml_input' -include { GENERATE_GENOME } from '../subworkflows/local/generate_genome' -include { EXTRACT_TIARA_HITS } from '../subworkflows/local/extract_tiara_hits' -include { EXTRACT_NT_BLAST } from '../subworkflows/local/extract_nt_blast' -include { RUN_FCSADAPTOR } from '../subworkflows/local/run_fcsadaptor' -include { RUN_NT_KRAKEN } from '../subworkflows/local/run_nt_kraken' -include { RUN_FCSGX } from '../subworkflows/local/run_fcsgx' -include { PACBIO_BARCODE_CHECK } from '../subworkflows/local/pacbio_barcode_check' -include { RUN_READ_COVERAGE } from '../subworkflows/local/run_read_coverage' -include { RUN_VECSCREEN } from '../subworkflows/local/run_vecscreen' -include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' -include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' - - - -// -// MODULE: Local modules -// -include { GC_CONTENT } from '../modules/local/gc_content' - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT NF-CORE MODULES/SUBWORKFLOWS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// -// MODULE: Installed directly from nf-core/modules -// -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - RUN MAIN WORKFLOW -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -workflow ASCC { - - main: - ch_versions = Channel.empty() - - input_ch = Channel.fromPath(params.input, checkIfExists: true) - - // - // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE - // - YAML_INPUT ( - input_ch - ) - ch_versions = ch_versions.mix(YAML_INPUT.out.versions) - - // - // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA - // - GC_CONTENT ( - YAML_INPUT.out.reference_tuple - ) - ch_versions = ch_versions.mix(GC_CONTENT.out.versions) - - // - // SUBWORKFLOW: GENERATE GENOME FILE - // - GENERATE_GENOME ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.pacbio_barcodes - ) - ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) - - // - // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA - // - EXTRACT_TIARA_HITS ( - GENERATE_GENOME.out.reference_tuple - ) - ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) - - // - // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE - // - YAML_INPUT.out.reference_tuple - .combine ( YAML_INPUT.out.seqkit_sliding.toInteger() ) - .combine ( YAML_INPUT.out.seqkit_window.toInteger() ) - .map { meta, ref, sliding, window -> - tuple([ id : meta.id, - sliding : sliding, - window : window - ], - file(ref) - )} - .set { modified_input } - - // - // SUBWORKFLOW: EXTRACT RESULTS HITS FROM NT-BLAST - // -/* EXTRACT_NT_BLAST ( - modified_input, - YAML_INPUT.out.nt_database, - YAML_INPUT.out.ncbi_accessions, - YAML_INPUT.out.ncbi_rankedlineage_path - ) - ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) */ - - // - // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH - // - YAML_INPUT.out.mito_tuple - .branch { meta, check -> - valid: check != "NO MITO" - invalid: check == "NO MITO" - } - .set { mito_check } - - // - // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME - // - MITO_ORGANELLAR_BLAST ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.mito_var, - mito_check.valid - ) - ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) - - // - // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH - // - YAML_INPUT.out.plastid_tuple - .branch { meta, check -> - valid: check != "NO PLASTID" - invalid: check == "NO PLASTID" - } - .set { plastid_check } - - // - // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME - // - PLASTID_ORGANELLAR_BLAST ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.plastid_var, - plastid_check.valid - ) - ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) - - // - // SUBWORKFLOW: - // - RUN_FCSADAPTOR ( - YAML_INPUT.out.reference_tuple - ) - ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) - - // - // SUBWORKFLOW: - // - RUN_FCSGX ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.fcs_gx_database_path, - YAML_INPUT.out.taxid, - YAML_INPUT.out.ncbi_rankedlineage_path - ) - ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) - - // - // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA - // - /*PACBIO_BARCODE_CHECK ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.pacbio_tuple, - YAML_INPUT.out.pacbio_barcodes, - YAML_INPUT.out.pacbio_multiplex_codes - ) - ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions)*/ - - // - // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE - // - RUN_READ_COVERAGE ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.assembly_path, - YAML_INPUT.out.pacbio_tuple, - YAML_INPUT.out.reads_type - ) - ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) - - // - // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS - // - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) - - emit: - software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml - versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - COMPLETION EMAIL AND SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -workflow.onComplete { - if (params.email || params.email_on_fail) { - NfcoreTemplate.email(workflow, params, summary_params, projectDir, log) - } - NfcoreTemplate.summary(workflow, params, log) - if (params.hook_url) { - NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) - } - // TreeValProject.summary(workflow, reference_tuple, summary_params, projectDir) - -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ From 005c13e11444c225359e7f9ccb1955a6db91e674 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 17 Nov 2023 16:59:42 +0000 Subject: [PATCH 18/23] Missed a couple of comments --- workflows/ascc.nf | 5 ----- 1 file changed, 5 deletions(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index c61f2b7..014b34a 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -77,12 +77,10 @@ workflow ASCC { // // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA // - /* GC_CONTENT ( YAML_INPUT.out.reference_tuple ) ch_versions = ch_versions.mix(GC_CONTENT.out.versions) - */ // // SUBWORKFLOW: GENERATE GENOME FILE @@ -96,13 +94,10 @@ workflow ASCC { // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA // - - /* EXTRACT_TIARA_HITS ( GENERATE_GENOME.out.reference_tuple ) ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) - */ // // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE From 4669b12d8b80913e4b2406734d24a7fe470c174d Mon Sep 17 00:00:00 2001 From: yumisims Date: Fri, 17 Nov 2023 22:36:49 +0000 Subject: [PATCH 19/23] add wildcard to vecscreen output --- bin/parse_fcsgx_result.py | 79 +++++++++++------------ modules/local/filter_vecscreen_results.nf | 2 +- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/bin/parse_fcsgx_result.py b/bin/parse_fcsgx_result.py index 5f3f8bc..bc15372 100755 --- a/bin/parse_fcsgx_result.py +++ b/bin/parse_fcsgx_result.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ Script for parsing the result files of FCS-GX, originally written by Eerik Aunin (ea10) -further minor modifications by Yumi Sims (yy5) +further refactoring/modifications by Yumi Sims (yy5) """ import general_purpose_functions as gpf @@ -42,51 +42,50 @@ def load_taxids_data(taxonomy_file): """ Parses the *.taxonomy.rpt to find taxids that correspond to species names. Returns this as a dictionary """ - taxonomy_data = gpf.l(taxonomy_file) - assert len(taxonomy_data) > 2 - taxonomy_data = taxonomy_data[2 : len(taxonomy_data)] + taxonomy_data = gpf.l(taxonomy_file)[2:] + assert taxonomy_data, "Taxonomy data is empty" + collection_dict = OrderedDict() for line in taxonomy_data: - add_new_entry = False - multiple_divs_per_scaff = False split_line = line.split("\t") assert len(split_line) == 34 - scaff = split_line[0] - if "~" in scaff: - scaff = scaff.split("~")[0] - tax_name_1 = split_line[5] - tax_id_1 = split_line[6] - div_1 = split_line[7] - cvg_by_div_1 = split_line[8] - if cvg_by_div_1 != "": - cvg_by_div_1 = int(cvg_by_div_1) - cvg_by_tax_1 = split_line[9] - if cvg_by_tax_1 != "": - cvg_by_tax_1 = int(cvg_by_tax_1) - score_1 = split_line[10] - if score_1 != "": - score_1 = int(score_1) - else: - score_1 = 0 - if scaff in collection_dict: - old_score = collection_dict[scaff]["fcs_gx_score"] - if old_score < score_1: - add_new_entry = True - multiple_divs_per_scaff = True - else: - add_new_entry = True - if add_new_entry is True: - row_dict = dict() - row_dict["fcs_gx_top_tax_name"] = tax_name_1 - row_dict["fcs_gx_top_taxid"] = tax_id_1 - row_dict["fcs_gx_div"] = div_1 - row_dict["fcs_gx_coverage_by_div"] = cvg_by_div_1 - row_dict["fcs_gx_coverage_by_tax"] = cvg_by_tax_1 - row_dict["fcs_gx_score"] = score_1 - row_dict["fcs_gx_multiple_divs_per_scaff"] = multiple_divs_per_scaff - row_dict["fcs_gx_action"] = "NA" + + scaff = split_line[0].split("~")[0] + tax_name_1, tax_id_1, div_1, cvg_by_div_1, cvg_by_tax_1, score_1 = ( + split_line[5], + split_line[6], + split_line[7], + int(split_line[8]) if split_line[8] else None, + int(split_line[9]) if split_line[9] else None, + int(split_line[10]) if split_line[10] else 0, + ) + + if scaff in collection_dict and collection_dict[scaff]["fcs_gx_score"] < score_1: + row_dict = { + "fcs_gx_top_tax_name": tax_name_1, + "fcs_gx_top_taxid": tax_id_1, + "fcs_gx_div": div_1, + "fcs_gx_coverage_by_div": cvg_by_div_1, + "fcs_gx_coverage_by_tax": cvg_by_tax_1, + "fcs_gx_score": score_1, + "fcs_gx_multiple_divs_per_scaff": True, + "fcs_gx_action": "NA", + } collection_dict[scaff] = row_dict + elif scaff not in collection_dict: + row_dict = { + "fcs_gx_top_tax_name": tax_name_1, + "fcs_gx_top_taxid": tax_id_1, + "fcs_gx_div": div_1, + "fcs_gx_coverage_by_div": cvg_by_div_1, + "fcs_gx_coverage_by_tax": cvg_by_tax_1, + "fcs_gx_score": score_1, + "fcs_gx_multiple_divs_per_scaff": False, + "fcs_gx_action": "NA", + } + collection_dict[scaff] = row_dict + return collection_dict diff --git a/modules/local/filter_vecscreen_results.nf b/modules/local/filter_vecscreen_results.nf index 9cb1b3b..cf1863e 100644 --- a/modules/local/filter_vecscreen_results.nf +++ b/modules/local/filter_vecscreen_results.nf @@ -11,7 +11,7 @@ process FILTER_VECSCREEN_RESULTS { tuple val(meta), path(vecscreen_outfile) output: - tuple val(meta), path("vecscreen.grepped.out"), emit: filtered_vecscreen_outfile + tuple val(meta), path("*vecscreen.grepped.out"), emit: filtered_vecscreen_outfile path "versions.yml" , emit: versions when: From 946cf0ed619730dc638cc0547e18a2f20c6f3287 Mon Sep 17 00:00:00 2001 From: yumisims Date: Fri, 17 Nov 2023 23:01:37 +0000 Subject: [PATCH 20/23] remove standard error output to pass github test --- bin/pacbio_barcode_check.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/bin/pacbio_barcode_check.py b/bin/pacbio_barcode_check.py index 5103d5b..fe41264 100755 --- a/bin/pacbio_barcode_check.py +++ b/bin/pacbio_barcode_check.py @@ -13,6 +13,8 @@ Originally written by Eerik Aunin @eeaunin Adapted by Damon-Lee Pointon @DLBPointon + +Refactored by Yumi Sims """ from pathlib import Path @@ -45,19 +47,28 @@ def detect_barcodes_from_read_file_names(barcodes_fasta_path, pacbio_read_files) def check_if_barcodes_exist_in_barcodes_fasta(barcodes_list, barcodes_fasta_path): """ - Checks if the specified barcodes exist in the barcode sequences FASTA file, exits with an error message if a barcode is not found + Checks if the specified barcodes exist in the barcode sequences FASTA file, prints a message for each missing barcode. """ barcodes_fasta_data = gpf.l(barcodes_fasta_path) - barcode_names_in_fasta = [n.split(">")[1] for n in barcodes_fasta_data if n.startswith(">")] - for barcode in barcodes_list: - if barcode not in barcode_names_in_fasta: - sys.stderr.write( - f"The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})\n" - ) - sys.exit(1) - - # If this print statement is reached, all user-supplied codes are present. - print("The query barcodes exist in the barcodes database file") + barcode_names_in_fasta = [n.split(">")[1].strip() for n in barcodes_fasta_data if n.startswith(">")] + + missing_barcodes = [barcode for barcode in barcodes_list if barcode not in barcode_names_in_fasta] + + print( + "\n".join( + [ + f"Warning: The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})" + for barcode in missing_barcodes + ] + ) + ) + + if missing_barcodes: + print( + f"\nSummary: Some barcodes were not found in the barcode sequences file.\nMissing barcodes: {', '.join(missing_barcodes)}" + ) + else: + print("\nAll query barcodes exist in the barcode sequences file.") def main(barcodes_fasta_path, pacbio_read_files, pacbio_multiplexing_barcode_names): From 05612ddc40a1509951fc3ae7441ea3749bdf21a3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 20 Nov 2023 13:05:57 +0000 Subject: [PATCH 21/23] Reverting changes to pacbio_barcode_check and adding barcodes found in test to test.yaml --- assets/test.yaml | 2 +- bin/pacbio_barcode_check.py | 33 +++++++++++---------------------- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/assets/test.yaml b/assets/test.yaml index 2f3f9e1..e9eafa2 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -3,7 +3,7 @@ assembly_title: asccTinyTest reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/ reads_type: "hifi" pacbio_barcodes: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/pacbio_adaptors.fa -pacbio_multiplexing_barcode_names: "bc1008,bc1009" +pacbio_multiplexing_barcode_names: "bc2008,bc2009" sci_name: "Plasmodium yoelii yoelii 17XNL" taxid: 352914 mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa diff --git a/bin/pacbio_barcode_check.py b/bin/pacbio_barcode_check.py index fe41264..5103d5b 100755 --- a/bin/pacbio_barcode_check.py +++ b/bin/pacbio_barcode_check.py @@ -13,8 +13,6 @@ Originally written by Eerik Aunin @eeaunin Adapted by Damon-Lee Pointon @DLBPointon - -Refactored by Yumi Sims """ from pathlib import Path @@ -47,28 +45,19 @@ def detect_barcodes_from_read_file_names(barcodes_fasta_path, pacbio_read_files) def check_if_barcodes_exist_in_barcodes_fasta(barcodes_list, barcodes_fasta_path): """ - Checks if the specified barcodes exist in the barcode sequences FASTA file, prints a message for each missing barcode. + Checks if the specified barcodes exist in the barcode sequences FASTA file, exits with an error message if a barcode is not found """ barcodes_fasta_data = gpf.l(barcodes_fasta_path) - barcode_names_in_fasta = [n.split(">")[1].strip() for n in barcodes_fasta_data if n.startswith(">")] - - missing_barcodes = [barcode for barcode in barcodes_list if barcode not in barcode_names_in_fasta] - - print( - "\n".join( - [ - f"Warning: The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})" - for barcode in missing_barcodes - ] - ) - ) - - if missing_barcodes: - print( - f"\nSummary: Some barcodes were not found in the barcode sequences file.\nMissing barcodes: {', '.join(missing_barcodes)}" - ) - else: - print("\nAll query barcodes exist in the barcode sequences file.") + barcode_names_in_fasta = [n.split(">")[1] for n in barcodes_fasta_data if n.startswith(">")] + for barcode in barcodes_list: + if barcode not in barcode_names_in_fasta: + sys.stderr.write( + f"The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})\n" + ) + sys.exit(1) + + # If this print statement is reached, all user-supplied codes are present. + print("The query barcodes exist in the barcodes database file") def main(barcodes_fasta_path, pacbio_read_files, pacbio_multiplexing_barcode_names): From 79a804a3c0ef8c8dc77df4200d61d22e2d36cbfc Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 20 Nov 2023 13:11:29 +0000 Subject: [PATCH 22/23] Correct the other test.yaml for github testing --- assets/github_testing/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index 7b35de1..c5a8bbf 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -1,7 +1,7 @@ assembly_path: /home/runner/work/ascc/ascc/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa assembly_title: asccTinyTest pacbio_barcodes: /home/runner/work/ascc/ascc/pacbio_barcode/pacbio_adaptors.fa -pacbio_multiplexing_barcode_names: "bc1008,bc1009" +pacbio_multiplexing_barcode_names: "bc2008,bc2009" reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio reads_type: "hifi" sci_name: "Plasmodium yoelii yoelii 17XNL" From afbbde76f75acafe6bf3c44917fa2a4921eae77d Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 20 Nov 2023 14:48:59 +0000 Subject: [PATCH 23/23] Fixed chunk assembly for vecscreen, added argparse for threshold variable --- bin/chunk_assembly_for_vecscreen.py | 44 ++++++++++--------- modules/local/chunk_assembly_for_vecscreen.nf | 3 +- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/bin/chunk_assembly_for_vecscreen.py b/bin/chunk_assembly_for_vecscreen.py index e36d1ed..bf8068a 100755 --- a/bin/chunk_assembly_for_vecscreen.py +++ b/bin/chunk_assembly_for_vecscreen.py @@ -10,32 +10,33 @@ def process_record(record, chunk_num, threshold_length, overlap_length): - return ( - record[chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length)] - if chunk_num * threshold_length < len(record) - (threshold_length + overlap_length) - else record[chunk_num * threshold_length :] - ) + """ + ID chunks within a certain threshold, this sequence is then excised + If it falls outside of the calculated threshold then rest of sequence is output + """ + if chunk_num * threshold_length < len(record) - (threshold_length + overlap_length): + return record[chunk_num * threshold_length : ((chunk_num + 1) * threshold_length + overlap_length)] + else: + return record[chunk_num * threshold_length :] def generate_records_to_write(record, threshold_length, overlap_length): - return [ - ( - record_slice.id + ".chunk_{}".format(chunk_num + 1), - "", - record_slice, - ) - for chunk_num, record_slice in enumerate( - process_record(record, i, threshold_length, overlap_length) - for i in range((len(record) - 1) // threshold_length + 1) - ) - ] - - -def main(fasta_input_file, fasta_output_file): + """ + For record in input process the record + generate new id of id + counter + """ + counter = 0 + for i in range((len(record) - 1) // threshold_length + 1): + record_slice = process_record(record, i, threshold_length, overlap_length) + counter += 1 + record_slice.id = record_slice.id + ".chunk_{}".format(counter) + return record_slice + + +def main(fasta_input_file, fasta_output_file, threshold_length): fasta_input_file = os.path.abspath(fasta_input_file) fasta_output_file = os.path.abspath(fasta_output_file) - threshold_length = 500000 overlap_length = int(threshold_length / 10) minimum_record_size = 11 @@ -54,6 +55,7 @@ def main(fasta_input_file, fasta_output_file): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("fasta_input_file", type=str, help="Path to input FASTA file") parser.add_argument("fasta_output_file", type=str, help="Path for FASTA output file") + parser.add_argument("--threshold", type=int, default=500000, help="Threshold length of sequence") parser.add_argument("-v", "--version", action="version", version="1.0") args = parser.parse_args() - main(args.fasta_input_file, args.fasta_output_file) + main(args.fasta_input_file, args.fasta_output_file, args.threshold) diff --git a/modules/local/chunk_assembly_for_vecscreen.nf b/modules/local/chunk_assembly_for_vecscreen.nf index 9969e42..bf7a657 100644 --- a/modules/local/chunk_assembly_for_vecscreen.nf +++ b/modules/local/chunk_assembly_for_vecscreen.nf @@ -19,8 +19,9 @@ process CHUNK_ASSEMBLY_FOR_VECSCREEN { script: def prefix = args.ext.prefix ?: "${meta.id}" + def args = args.ext.args ?: "" """ - chunk_assembly_for_vecscreen.py $fasta_input_file ${prefix}_chunked_assembly.fa + chunk_assembly_for_vecscreen.py $fasta_input_file ${prefix}_chunked_assembly.fa ${args} cat <<-END_VERSIONS > versions.yml "${task.process}":