diff --git a/bin/dl_pdg_metadata.py b/bin/dl_pdg_metadata.py index 697f0cf..63b196f 100755 --- a/bin/dl_pdg_metadata.py +++ b/bin/dl_pdg_metadata.py @@ -2,15 +2,16 @@ # Kranti Konganti -import os -import shutil -import tempfile import argparse import inspect import logging +import os import re -from urllib.request import urlopen +import shutil +import ssl +import tempfile from html.parser import HTMLParser +from urllib.request import urlopen # Set logging.f logging.basicConfig( @@ -41,6 +42,10 @@ def dl_pdg(**kwargs) -> None: """ db_path, url, regex, suffix, overwrite, release = [kwargs[k] for k in kwargs.keys()] + contxt = ssl.create_default_context() + contxt.check_hostname = False + contxt.verify_mode = ssl.CERT_NONE + if (db_path or url) == None: logging.error("Please provide absolute UNIX path\n" + "to store the result DB flat files.") exit(1) @@ -51,7 +56,7 @@ def dl_pdg(**kwargs) -> None: html_parser = NCBIPathogensHTMLParser() logging.info(f"Finding latest NCBI PDG release at:\n{url}") - with urlopen(url) as response: + with urlopen(url, context=contxt) as response: with tempfile.NamedTemporaryFile(delete=False) as tmp_html_file: shutil.copyfileobj(response, tmp_html_file) @@ -86,7 +91,7 @@ def dl_pdg(**kwargs) -> None: tsv_at = os.path.join(dest_dir, pdg_filename) logging.info(f"Saving to:\n{tsv_at}") - with urlopen(pdg_metadata_url) as response: + with urlopen(pdg_metadata_url, context=contxt) as response: with open(tsv_at, "w") as tsv: tsv.writelines(response.read().decode("utf-8")) diff --git a/bin/get_top_unique_mash_hit_genomes.py b/bin/get_top_unique_mash_hit_genomes.py index b66c048..a648fcc 100755 --- a/bin/get_top_unique_mash_hit_genomes.py +++ b/bin/get_top_unique_mash_hit_genomes.py @@ -2,16 +2,18 @@ # Kranti Konganti -import os -import glob -import pickle import argparse +import glob import inspect import logging -import re +import os +import pickle import pprint +import re +import subprocess from collections import defaultdict + # Multiple 
inheritence for pretty printing of help text. class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): pass @@ -20,17 +22,24 @@ class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefa # Main def main() -> None: """ - This script works only in the context of `bettercallsal` Nextflow workflow. + This script works only in the context of a Nextflow workflow. It takes: 1. A pickle file containing a dictionary object where genome accession is the key and the computed serotype is the value. - 2. A file with `mash screen` results run against the Salmonella SNP - Cluster genomes' sketch. + OR + 1. It takes a pickle file containing a nested dictionary, where genome accession + is the key and the metadata is a dictionary associated with that key. + 2. A file with `mash screen` results. 3. A directory containing genomes' FASTA in gzipped format where the FASTA file contains 2 lines: one FASTA header followed by genome Sequence. and then generates a concatenated FASTA file of top N unique `mash screen` genome hits as requested. + + In addition: + 1. User can skip `mash screen` hits that originate from the supplied + bio project accessions. + For -skip option to work, ncbi-datasets should be available in $PATH. """ # Set logging. 
@@ -109,12 +118,22 @@ def main() -> None: "-n", dest="num_uniq_hits", default=10, - help="This many number of serotype genomes' accessions are " + "\nreturned.", + required=False, + help="This many number of serotype genomes' accessions are returned.", + ) + parser.add_argument( + "-skip", + dest="skip_accs", + default=str(""), + required=False, + help="Skip all hits which belong to the following bioproject accession(s).\n" + + "A comma separated list of more than one bioproject.", ) parser.add_argument( "-op", dest="out_prefix", default="MASH_SCREEN", + required=False, help="Set the output file prefix for .fna.gz and .txt files.", ) # required = parser.add_argument_group('required arguments') @@ -129,12 +148,34 @@ def main() -> None: genomes_dir = args.genomes_dir genomes_dir_suffix = args.genomes_dir_suffix out_prefix = args.out_prefix + skip_accs = args.skip_accs + skip_accs_list = list() + skip_check = re.compile(r"PRJNA\d+(?:\,PRJNA\d+)*") + req_metadata = { + "mlst_sequence_type": "ST", + "epi_type": "ET", + "host": "HO", + "host_disease": "HD", + "isolation_source": "IS", + "outbreak": "OU", + "source_type": "SOT", + "strain": "GS", + } + target_acc_key = "target_acc" + ncbi_path_heading = "NCBI Pathogen Isolates Browser" + ncbi_path_uri = "https://www.ncbi.nlm.nih.gov/pathogens/isolates/#" mash_genomes_gz = os.path.join( os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz" ) mash_uniq_hits_txt = os.path.join( os.getcwd(), re.sub(".fna.gz", ".txt", os.path.basename(mash_genomes_gz)) ) + mash_uniq_accs_txt = os.path.join( + os.getcwd(), re.sub(".fna.gz", "_ACCS.txt", os.path.basename(mash_genomes_gz)) + ) + mash_popup_info_txt = os.path.join( + os.getcwd(), re.sub(".fna.gz", "_POPUP.txt", os.path.basename(mash_genomes_gz)) + ) if mash_screen_res and os.path.exists(mash_genomes_gz): logging.error( @@ -153,6 +194,36 @@ def main() -> None: logging.error("When -m is on, -ps and -gd are also required.") exit(1) + if skip_accs and not 
skip_check.match(skip_accs): + logging.error( + "Supplied bio project accessions are not valid!\n" + + "Valid options:\n\t-skip PRJNA766315\n\t-skip PRJNA766315,PRJNA675435" + ) + exit(1) + elif skip_check.match(skip_accs): + datasets_cmd = "datasets summary genome accession --as-json-lines --report ids_only".split() + datasets_cmd.append(skip_accs) + dataformat_cmd = "dataformat tsv genome --fields accession --elide-header".split() + try: + accs_query = subprocess.run(datasets_cmd, capture_output=True, check=True) + try: + skip_accs_list = ( + subprocess.check_output(dataformat_cmd, input=accs_query.stdout) + .decode("utf-8") + .split("\n") + ) + except subprocess.CalledProcessError as e: + logging.error(f"Query failed\n\t{' '.join(dataformat_cmd)}\nError:\n\t{e}") + exit(1) + except subprocess.CalledProcessError as e: + logging.error(f"Query failed\n\t{' '.join(datasets_cmd)}\nError:\n\t{e}") + exit(1) + + if len(skip_accs_list) > 0: + filter_these_hits = list(filter(bool, skip_accs_list)) + else: + filter_these_hits = list() + if genomes_dir: if not os.path.isdir(genomes_dir): logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!") @@ -218,7 +289,6 @@ def main() -> None: write_pickled_sero.close() exit(0) elif pickle_sero and not (os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0): - logging.error( "Requested to create pickle from metadata, but\n" + f"the file, {os.path.basename(pickle_sero)} is empty or\ndoes not exist!" 
@@ -227,7 +297,6 @@ def main() -> None: if mash_screen_res and os.path.exists(mash_screen_res): if os.path.getsize(mash_screen_res) > 0: - seen_uniq_hits = 0 unpickled_acc2serotype = pickle.load(file=open(pickled_sero, "rb")) @@ -271,25 +340,84 @@ def main() -> None: # ppp.pprint(mash_hits) msh_out_txt = open(mash_uniq_hits_txt, "w") + wrote_header_pop = False + wrote_header_acc = False + with open(mash_genomes_gz, "wb") as msh_out_gz: for _, (ident, acc_list) in enumerate(sorted(mash_hits.items(), reverse=True)): for acc in acc_list: + if len(filter_these_hits) > 0 and acc in filter_these_hits: + continue if seen_uniq_hits >= num_uniq_hits: break - if unpickled_acc2serotype[acc] not in seen_mash_sero.keys(): - seen_mash_sero[unpickled_acc2serotype[acc]] = 1 - seen_uniq_hits += 1 - # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout) - msh_out_txt.write( - f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n" + if isinstance(unpickled_acc2serotype[acc], dict): + if target_acc_key in unpickled_acc2serotype[acc].keys(): + if not wrote_header_pop: + mash_out_pop_txt = open(mash_popup_info_txt, "w") + mash_out_pop_txt.write("POPUP_INFO\nSEPARATOR COMMA\nDATA\n") + wrote_header_pop = True + + pdt = "".join(unpickled_acc2serotype[acc][target_acc_key]) + + popup_line = ",".join( + [ + acc, + ncbi_path_heading, + f'{pdt}', + ] + ) + mash_out_pop_txt.write(popup_line + "\n") + + if all( + k in unpickled_acc2serotype[acc].keys() for k in req_metadata.keys() + ): + if not wrote_header_acc: + msh_out_accs_txt = open(mash_uniq_accs_txt, "w") + msh_out_txt.write("METADATA\nSEPARATOR COMMA\nFIELD_LABELS,") + msh_out_txt.write( + f"{','.join([str(key).upper() for key in req_metadata.keys()])}\nDATA\n" + ) + wrote_header_acc = True + + metadata_line = ",".join( + [ + re.sub( + ",", + "", + "|".join(unpickled_acc2serotype[acc][m]), + ) + for m in req_metadata.keys() + ], + ) + + msh_out_txt.write(f"{acc.strip()},{metadata_line}\n") + 
msh_out_accs_txt.write( + f"{os.path.join(genomes_dir, acc + genomes_dir_suffix)}\n" ) - with open( - os.path.join(genomes_dir, acc + genomes_dir_suffix), "rb" - ) as msh_in_gz: - msh_out_gz.writelines(msh_in_gz.readlines()) - msh_in_gz.close() + seen_mash_sero[acc] = 1 + seen_uniq_hits += 1 + elif not isinstance(unpickled_acc2serotype[acc], dict): + if unpickled_acc2serotype[acc] not in seen_mash_sero.keys(): + seen_mash_sero[unpickled_acc2serotype[acc]] = 1 + seen_uniq_hits += 1 + # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout) + msh_out_txt.write( + f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n" + ) + with open( + os.path.join(genomes_dir, acc + genomes_dir_suffix), + "rb", + ) as msh_in_gz: + msh_out_gz.writelines(msh_in_gz.readlines()) + msh_in_gz.close() msh_out_gz.close() msh_out_txt.close() + + if "msh_out_accs_txt" in locals().keys() and not msh_out_accs_txt.closed: + msh_out_accs_txt.close() + if "mash_out_pop_txt" in locals().keys() and not mash_out_pop_txt.closed: + mash_out_pop_txt.close() + logging.info( f"File {os.path.basename(mash_genomes_gz)}\n" + f"written in:\n{os.getcwd()}\nDone! Bye!" 
diff --git a/bin/waterfall_per_computed_serotype.pl b/bin/waterfall_per_computed_serotype.pl index 9cc9ec2..7a8e4aa 100755 --- a/bin/waterfall_per_computed_serotype.pl +++ b/bin/waterfall_per_computed_serotype.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # Kranti Konganti -# 09/14/2022 +# 01/02/2024 use strict; use warnings; @@ -20,6 +20,7 @@ $serovar_limit, $serovar_or_type_col, $min_asm_size, $complete_serotype_name, $PDG_file, $table_file, $not_null_pdg_serovar, $help, $out_prefix, + $acc_col, $seronamecol, $target_acc_col, @custom_serovars ); @@ -30,6 +31,9 @@ 'min_contig_size=i' => \$min_asm_size, 'complete_serotype_name' => \$complete_serotype_name, 'serotype_col:i' => \$serovar_or_type_col, + 'seronamecol:i' => \$seronamecol, + 'target_acc_col:i' => \$target_acc_col, + 'acc_col:i' => \$acc_col, 'not_null_pdg_serovar' => \$not_null_pdg_serovar, 'num_serotypes_per_serotype:i' => \$serovar_limit, 'include_serovar=s' => \@custom_serovars, @@ -41,7 +45,19 @@ } if ( !defined $serovar_or_type_col ) { - $serovar_or_type_col = 49; + $serovar_or_type_col = 50; +} + +if ( !defined $seronamecol ) { + $seronamecol = 34; +} + +if ( !defined $target_acc_col ) { + $target_acc_col = 43; +} + +if ( !defined $acc_col ) { + $acc_col = 10; } if ( !defined $min_asm_size ) { @@ -77,15 +93,15 @@ next if ( $line =~ m/^\#/ ); # Relevent columns (Perl index): - # 9: asm_acc - # 33: serovar - # 48: computed serotype + # 10-1 = 9: asm_acc + # 34 -1 = 33: serovar + # 50 -1 = 49: computed serotype my @cols = split( /\t/, $line ); my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ]; - my $acc = $cols[9]; - my $serovar = $cols[33]; - my $target_acc = $cols[41]; + my $acc = $cols[ $acc_col - 1 ]; + my $serovar = $cols[ $seronamecol - 1 ]; + my $target_acc = $cols[ $target_acc_col - 1 ]; $serovar_or_type =~ s/\"//g; @@ -318,7 +334,23 @@ =head1 OPTIONS =item --serocol (Optional) Column number (non 0-based index) of the PDG metadata file -by which the serotypes are collected. 
Default: 49 +by which the serotypes are collected +(column name: "computed_types"). Default: 50 + +=item --seronamecol (Optional) + +Column number (non 0-based index) of the PDG metadata file +whose column name is "serovar". Default: 34 + +=item --acc_col (Optional) + +Column number (non 0-based index) of the PDG metadata file +whose column name is "acc". Default: 10 + +=item --target_acc_col (Optional) + +Column number (non 0-based index) of the PDG metadata file +whose column name is "target_acc". Default: 43 =item --complete_serotype_name (Optional) diff --git a/bin/waterfall_per_snp_cluster.pl b/bin/waterfall_per_snp_cluster.pl index b87a6d4..5667d22 100755 --- a/bin/waterfall_per_snp_cluster.pl +++ b/bin/waterfall_per_snp_cluster.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # Kranti Konganti -# 08/23/2023 +# 01/02/2024 use strict; use warnings; @@ -23,7 +23,8 @@ $serovar_limit, $serovar_or_type_col, $min_asm_size, $complete_serotype_name, $PDG_file, $table_file, $not_null_pdg_serovar, $snp_cluster, $help, - $out_prefix + $out_prefix, $acc_col, $seronamecol, + $target_acc_col ); my @custom_serovars; @@ -35,6 +36,9 @@ 'min_contig_size=i' => \$min_asm_size, 'complete_serotype_name' => \$complete_serotype_name, 'serocol:i' => \$serovar_or_type_col, + 'seronamecol:i' => \$seronamecol, + 'target_acc_col:i' => \$target_acc_col, + 'acc_col:i' => \$acc_col, 'not_null_pdg_serovar' => \$not_null_pdg_serovar, 'num_serotypes_per_serotype:i' => \$serovar_limit, 'include_serovar=s' => \@custom_serovars, @@ -50,7 +54,19 @@ } if ( !defined $serovar_or_type_col ) { - $serovar_or_type_col = 49; + $serovar_or_type_col = 50; +} + +if ( !defined $seronamecol ) { + $seronamecol = 34; +} + +if ( !defined $target_acc_col ) { + $target_acc_col = 43; +} + +if ( !defined $acc_col ) { + $acc_col = 10; } if ( !defined $min_asm_size ) { @@ -90,15 +106,15 @@ next if ( $line =~ m/^\#/ ); # Relevent columns (Perl index): - # 9: asm_acc - # 33: serovar - # 48: computed serotype + # 10-1 = 9: asm_acc 
+ # 34 -1 = 33: serovar + # 50 -1 = 49: computed serotype my @cols = split( /\t/, $line ); my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ]; - my $acc = $cols[9]; - my $serovar = $cols[33]; - my $target_acc = $cols[41]; + my $acc = $cols[ $acc_col - 1 ]; + my $serovar = $cols[ $seronamecol - 1 ]; + my $target_acc = $cols[ $target_acc_col - 1 ]; $serovar_or_type =~ s/\"//g; @@ -370,11 +386,26 @@ =head1 OPTIONS Absolute UNIX path pointing to the SNP Cluster metadata file. Examples: PDG000000002.2505.reference_target.cluster_list.tsv - =item --serocol (Optional) Column number (non 0-based index) of the PDG metadata file -by which the serotypes are collected. Default: 49 +by which the serotypes are collected +(column name: "computed_types"). Default: 50 + +=item --seronamecol (Optional) + +Column number (non 0-based index) of the PDG metadata file +whose column name is "serovar". Default: 34 + +=item --acc_col (Optional) + +Column number (non 0-based index) of the PDG metadata file +whose column name is "acc". Default: 10 + +=item --target_acc_col (Optional) + +Column number (non 0-based index) of the PDG metadata file +whose column name is "target_acc". 
Default: 43 =item --complete_serotype_name (Optional) diff --git a/conf/computeinfra.config b/conf/computeinfra.config index 691063f..97dce78 100644 --- a/conf/computeinfra.config +++ b/conf/computeinfra.config @@ -43,7 +43,7 @@ raven { eprod { process.executor = 'slurm' - process.queue = 'lowmem,midmem,bigmem' + process.queue = 'combined' process.memory = '10GB' process.cpus = 4 params.enable_conda = false diff --git a/conf/manifest.config b/conf/manifest.config index 87edbe2..2b4d9b7 100644 --- a/conf/manifest.config +++ b/conf/manifest.config @@ -2,8 +2,8 @@ manifest { author = 'Kranti Konganti' homePage = 'https://xxxxxx.xxxxx.xxxx/Kranti.Konganti/cpipes' name = 'bettercallsal' - version = '0.6.1' - nextflowVersion = '>=22.10' + version = '0.7.0' + nextflowVersion = '>=23.04' description = 'Modular Nextflow pipelines at CFSAN, FDA.' doi = 'https://doi.org/10.3389/fmicb.2023.1200983' } \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index b53128a..99601c9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -15,6 +15,11 @@ process { } maxRetries = 1 + resourceLabels = {[ + process: task.process, + memoryRequested: task.memory.toString(), + cpusRequested: task.cpus.toString() + ]} withLabel: 'process_femto' { cpus = { 1 * task.attempt } diff --git a/lib/help/tuspy.nf b/lib/help/tuspy.nf index b8c4134..6f3e38d 100644 --- a/lib/help/tuspy.nf +++ b/lib/help/tuspy.nf @@ -59,6 +59,13 @@ def tuspyHelp(params) { " Default: ${params.tuspy_n}", cliflag: '-n', clivalue: (params.tuspy_n ?: '') + ], + 'tuspy_skip': [ + clihelp: 'Skip all hits which belong to the following bioproject ' + + 'accession(s). A comma separated list of more than one bioproject. 
' + + " Default: ${params.tuspy_skip}", + cliflag: '-skip', + clivalue: (params.tuspy_skip ?: '') ] ] diff --git a/lib/help/wcomp.nf b/lib/help/wcomp.nf index 88e3df5..adca833 100644 --- a/lib/help/wcomp.nf +++ b/lib/help/wcomp.nf @@ -15,6 +15,27 @@ def wcompHelp(params) { cliflag: '--serocol', clivalue: (params.wcomp_serocol ?: '') ], + 'wcomp_seronamecol': [ + clihelp: 'Column number (non 0-based index) of the PDG metadata file whose column ' + + 'name is "serovar". ' + + " Default: ${params.wcomp_seronamecol}", + cliflag: '--seronamecol', + clivalue: (params.wcomp_seronamecol ?: '') + ], + 'wcomp_acc_col': [ + clihelp: 'Column number (non 0-based index) of the PDG metadata file whose column ' + + 'name is "acc". ' + + " Default: ${params.wcomp_acc_col}", + cliflag: '--acc_col', + clivalue: (params.wcomp_acc_col ?: '') + ], + 'wcomp_target_acc_col': [ + clihelp: 'Column number (non 0-based index) of the PDG metadata file whose column ' + + 'name is "target_acc". ' + + " Default: ${params.wcomp_target_acc_col}", + cliflag: '--target_acc_col', + clivalue: (params.wcomp_target_acc_col ?: '') + ], 'wcomp_complete_sero': [ clihelp: 'Skip indexing serotypes when the serotype name in the column ' + 'number 49 (non 0-based) of PDG metadata file consists a "-". For example, if ' + diff --git a/lib/help/wsnp.nf b/lib/help/wsnp.nf index 768cc08..2d1735e 100644 --- a/lib/help/wsnp.nf +++ b/lib/help/wsnp.nf @@ -15,6 +15,27 @@ def wsnpHelp(params) { cliflag: '--serocol', clivalue: (params.wsnp_serocol ?: '') ], + 'wsnp_seronamecol': [ + clihelp: 'Column number (non 0-based index) of the PDG metadata file whose column ' + + 'name is "serovar". ' + + " Default: ${params.wsnp_seronamecol}", + cliflag: '--seronamecol', + clivalue: (params.wsnp_seronamecol ?: '') + ], + 'wsnp_acc_col': [ + clihelp: 'Column number (non 0-based index) of the PDG metadata file whose column ' + + 'name is "acc". 
' + + " Default: ${params.wsnp_acc_col}", + cliflag: '--acc_col', + clivalue: (params.wsnp_acc_col ?: '') + ], + 'wsnp_target_acc_col': [ + clihelp: 'Column number (non 0-based index) of the PDG metadata file whose column ' + + 'name is "target_acc". ' + + " Default: ${params.wsnp_target_acc_col}", + cliflag: '--target_acc_col', + clivalue: (params.wsnp_target_acc_col ?: '') + ], 'wsnp_complete_sero': [ clihelp: 'Skip indexing serotypes when the serotype name in the column ' + 'number 49 (non 0-based) of PDG metadata file consists a "-". For example, if ' + diff --git a/modules/db_per_computed_serotype/main.nf b/modules/db_per_computed_serotype/main.nf index bcddac1..082734a 100644 --- a/modules/db_per_computed_serotype/main.nf +++ b/modules/db_per_computed_serotype/main.nf @@ -28,7 +28,7 @@ process DB_PER_COMPUTED_SEROTYPE { """ waterfall_per_computed_serotype.pl \\ -p $pdg_metadata \\ - -t $accs_tbl \\ + -tb $accs_tbl \\ $args \\ 1> asm_chunk_comp.tbl 2> asm_chunk_comp_counts.tbl diff --git a/modules/db_per_snp_cluster/main.nf b/modules/db_per_snp_cluster/main.nf index 4611439..87f5bdf 100644 --- a/modules/db_per_snp_cluster/main.nf +++ b/modules/db_per_snp_cluster/main.nf @@ -29,7 +29,7 @@ process DB_PER_SNP_CLUSTER { """ waterfall_per_snp_cluster.pl \\ -p $pdg_metadata \\ - -t $accs_tbl \\ + -tb $accs_tbl \\ -snp $snp_cluster_metadata \\ $args \\ 1> asm_chunk_snp.tbl 2> asm_chunk_snp_counts.tbl diff --git a/modules/otf_genome/main.nf b/modules/otf_genome/main.nf index fed4a34..b1de685 100644 --- a/modules/otf_genome/main.nf +++ b/modules/otf_genome/main.nf @@ -36,4 +36,4 @@ process OTF_GENOME { python: \$( python --version | sed 's/Python //g' ) END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/scaffold_genomes/main.nf b/modules/scaffold_genomes/main.nf index 13a3085..d4d8c2d 100644 --- a/modules/scaffold_genomes/main.nf +++ b/modules/scaffold_genomes/main.nf @@ -1,6 +1,6 @@ process SCAFFOLD_GENOMES { tag "fasta_join.pl" - label 
"process_micro" + label "process_nano" module (params.enable_module ? "${params.swmodulepath}${params.fs}perl${params.fs}5.30.0" : null) conda (params.enable_conda ? "conda-forge::perl bioconda::perl-bioperl=1.7.8" : null) diff --git a/modules/top_unique_serovars/main.nf b/modules/top_unique_serovars/main.nf index 7270754..8f91422 100644 --- a/modules/top_unique_serovars/main.nf +++ b/modules/top_unique_serovars/main.nf @@ -12,10 +12,12 @@ process TOP_UNIQUE_SEROVARS { tuple val(meta), path(mash_screen_res) output: - tuple val(meta), path('*_UNIQUE_HITS.txt') , emit: tsv, optional: true - tuple val(meta), path('*_UNIQUE_HITS.fna.gz'), emit: genomes_fasta, optional: true - path'*FAILED.txt' , emit: failed, optional: true - path 'versions.yml' , emit: versions + tuple val(meta), path('*_UNIQUE_HITS.txt') , emit: tsv, optional: true + tuple val(meta), path('*_UNIQUE_HITS_POPUP.txt'), emit: popup, optional: true + tuple val(meta), path('*_UNIQUE_HITS_ACCS.txt') , emit: accessions, optional: true + tuple val(meta), path('*_UNIQUE_HITS.fna.gz') , emit: genomes_fasta, optional: true + path'*FAILED.txt' , emit: failed, optional: true + path 'versions.yml' , emit: versions when: task.ext.when == null || task.ext.when @@ -32,6 +34,8 @@ process TOP_UNIQUE_SEROVARS { cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$( python --version | sed 's/Python //g' ) + datasets: \$( datasets --version | sed 's/datasets version: //g' ) + dataformat: \$( dataformat version ) END_VERSIONS """ diff --git a/readme/bettercallsal.md b/readme/bettercallsal.md index 79a1b3d..eb2543e 100644 --- a/readme/bettercallsal.md +++ b/readme/bettercallsal.md @@ -28,7 +28,7 @@ ## Minimum Requirements -1. [Nextflow version 22.10.0](https://github.com/nextflow-io/nextflow/releases/download/v22.10.0/nextflow). +1. [Nextflow version 23.04.3](https://github.com/nextflow-io/nextflow/releases/download/v23.04.3/nextflow). 
- Make the `nextflow` binary executable (`chmod 755 nextflow`) and also make sure that it is made available in your `$PATH`. - If your existing `JAVA` install does not support the newest **Nextflow** version, you can try **Amazon**'s `JAVA` (OpenJDK): [Corretto](https://corretto.aws/downloads/latest/amazon-corretto-17-x64-linux-jdk.tar.gz). 2. Either of `micromamba` (version `1.0.0`) or `docker` or `singularity` installed and made available in your `$PATH`. @@ -87,7 +87,7 @@ cpipes --pipeline bettercallsal \ --input /path/to/illumina/fastq/dir \ --output /path/to/output \ - --bcs_root_dbdir /data/Kranti_Konganti/bettercallsal_db/PDG000000002.2727 + --bcs_root_dbdir /data/Kranti_Konganti/bettercallsal_db/PDG000000002.2876 ``` \ @@ -103,7 +103,7 @@ cpipes \ --pipeline bettercallsal \ --input /path/to/illumina/fastq/dir \ --output /path/to/output \ - --bcs_root_dbdir /data/Kranti_Konganti/bettercallsal_db/PDG000000002.2727 \ + --bcs_root_dbdir /data/Kranti_Konganti/bettercallsal_db/PDG000000002.2876 \ --fq_single_end false \ --fq_suffix '_R1_001.fastq.gz' ``` @@ -170,7 +170,7 @@ cpipes \ --pipeline bettercallsal \ --input /path/to/bettercallsal_sim_reads \ --output /path/to/bettercallsal_sim_reads_output \ - --bcs_root_dbdir /path/to/PDG000000002.2727 + --bcs_root_dbdir /path/to/PDG000000002.2876 --kmaalign_ignorequals \ --max_cpus 5 \ -profile stdkondagac \ @@ -271,7 +271,7 @@ After you make sure that you have all the [minimum requirements](#minimum-requir - Download simulated reads: [S3](https://cfsan-pub-xfer.s3.amazonaws.com/Kranti.Konganti/bettercallsal/bettercallsal_sim_reads.tar.bz2) (~ 3 GB). - Download pre-formatted test database: [S3](https://cfsan-pub-xfer.s3.amazonaws.com/Kranti.Konganti/bettercallsal/PDG000000002.2491.test-db.tar.bz2) (~ 75 MB). This test database works only with the simulated reads. 
-- Download pre-formatted full database (**Optional**): If you would like to do a complete run with your own **FASTQ** datasets, you can either create your own [database](./bettercallsal_db.md) or use [PDG000000002.2537](https://cfsan-pub-xfer.s3.amazonaws.com/Kranti.Konganti/bettercallsal/PDG000000002.2537.tar.bz2) version of the database (~ 37 GB). +- Download pre-formatted full database (**Optional**): If you would like to do a complete run with your own **FASTQ** datasets, you can either create your own [database](./bettercallsal_db.md) or use [PDG000000002.2727](https://cfsan-pub-xfer.s3.amazonaws.com/Kranti.Konganti/bettercallsal/PDG000000002.2727.tar.bz2) version of the database (~ 42 GB). - After succesful run of the workflow, your **MultiQC** report should look something like [this](https://cfsan-pub-xfer.s3.amazonaws.com/Kranti.Konganti/bettercallsal/bettercallsal_sim_reads_mqc.html). - It is always a best practice to use absolute UNIX paths and real destinations of symbolic links during pipeline execution. For example, find out the real path(s) of your absolute UNIX path(s) and use that for the `--input` and `--output` options of the pipeline. @@ -289,7 +289,7 @@ cpipes \ --pipeline bettercallsal \ --input /path/to/bettercallsal_sim_reads \ --output /path/to/bettercallsal_sim_reads_output \ - --bcs_root_dbdir /path/to/PDG000000002.2727 + --bcs_root_dbdir /path/to/PDG000000002.2876 --kmaalign_ignorequals \ -profile stdkondagac \ -resume @@ -326,9 +326,9 @@ Launching `./bettercallsal/cpipes` [awesome_chandrasekhar] DSL2 - revision: 8da4 -------------------------------------------------------------------------------- A collection of modular pipelines at CFSAN, FDA. -------------------------------------------------------------------------------- -Name : CPIPES +Name : bettercallsal Author : Kranti Konganti -Version : 0.6.1 +Version : 0.7.0 Center : CFSAN, FDA. 
================================================================================ diff --git a/readme/bettercallsal_db.md b/readme/bettercallsal_db.md index 5118a86..958fe89 100644 --- a/readme/bettercallsal_db.md +++ b/readme/bettercallsal_db.md @@ -1,6 +1,6 @@ # bettercallsal_db -`bettercallsal_db` is an end-to-end automated workflow to generate and consolidate the required DB flat files based on [NCBI Pathogens Database for Salmonella](https://ftp.ncbi.nlm.nih.gov/pathogen/Results/Salmonella/). It first downloads the metadata based on the provided release identifier (Ex: `latest_snps` or `PDG000000002.2727`) and then creates a `mash sketch` based on the filtering strategy. It generates two types of sketches, one that prioritizes genome collection based on SNP clustering (`per_snp_cluster`) and the other just collects up to N number of genome accessions for each `computed_serotype` column from the metadata file (`per_computed_serotype`). +`bettercallsal_db` is an end-to-end automated workflow to generate and consolidate the required DB flat files based on [NCBI Pathogens Database for Salmonella](https://ftp.ncbi.nlm.nih.gov/pathogen/Results/Salmonella/). It first downloads the metadata based on the provided release identifier (Ex: `latest_snps` or `PDG000000002.2876`) and then creates a `mash sketch` based on the filtering strategy. It generates two types of sketches, one that prioritizes genome collection based on SNP clustering (`per_snp_cluster`) and the other just collects up to N number of genome accessions for each `computed_serotype` column from the metadata file (`per_computed_serotype`). The `bettercallsal_db` workflow should finish within an hour with stable internet connection. @@ -16,13 +16,13 @@ cpipes --pipeline bettercallsal_db [options] \   -Example: Run the `bettercallsal_db` pipeline and store output at `/data/Kranti_Konganti/bettercallsal_db/PDG000000002.2727`. 
+Example: Run the `bettercallsal_db` pipeline and store output at `/data/Kranti_Konganti/bettercallsal_db/PDG000000002.2876`. ```bash cpipes --pipeline bettercallsal_db \ - --pdg_release PDG000000002.2727 \ - --output /data/Kranti_Konganti/bettercallsal_db/PDG000000002.2727 + --pdg_release PDG000000002.2876 \ + --output /data/Kranti_Konganti/bettercallsal_db/PDG000000002.2876 ``` \ @@ -35,7 +35,7 @@ cpipes --pipeline bettercallsal \ --input /path/to/illumina/fastq/dir \ --output /path/to/output \ - --bcs_root_dbdir /data/Kranti_Konganti/bettercallsal_db/PDG000000002.2727 + --bcs_root_dbdir /data/Kranti_Konganti/bettercallsal_db/PDG000000002.2876 ``` \ @@ -67,7 +67,7 @@ A collection of modular pipelines at CFSAN, FDA. -------------------------------------------------------------------------------- Name : bettercallsal Author : Kranti Konganti -Version : 0.6.1 +Version : 0.7.0 Center : CFSAN, FDA. ================================================================================ @@ -75,7 +75,7 @@ Workflow : bettercallsal_db Author : Kranti Konganti -Version : 0.6.1 +Version : 0.7.0 Required : @@ -90,6 +90,18 @@ Other options : PDG metadata file by which the serotypes are collected. Default: false +--wcomp_seronamecol : Column number (non 0-based index) of the + PDG metadata file whose column name is " + serovar". Default: false + +--wcomp_acc_col : Column number (non 0-based index) of the + PDG metadata file whose column name is "acc + ". Default: false + +--wcomp_target_acc_col : Column number (non 0-based index) of the + PDG metadata file whose column name is " + target_acc". Default: false + --wcomp_complete_sero : Skip indexing serotypes when the serotype name in the column number 49 (non 0-based) of PDG metadata file consists a "-". For @@ -120,6 +132,18 @@ Other options : PDG metadata file by which the serotypes are collected. Default: false +--wsnp_seronamecol : Column number (non 0-based index) of the + PDG metadata file whose column name is " + serovar". 
Default: false + +--wsnp_acc_col : Column number (non 0-based index) of the + PDG metadata file whose column name is "acc + ". Default: false + +--wsnp_target_acc_col : Column number (non 0-based index) of the + PDG metadata file whose column name is " + target_acc". Default: false + --wsnp_complete_sero : Skip indexing serotypes when the serotype name in the column number 49 (non 0-based) of PDG metadata file consists a "-". For diff --git a/workflows/bettercallsal.nf b/workflows/bettercallsal.nf index 63254db..4fa22c1 100644 --- a/workflows/bettercallsal.nf +++ b/workflows/bettercallsal.nf @@ -394,18 +394,21 @@ workflow BETTERCALLSAL { .collectFile(name: 'collected_versions.yml') ) - DUMP_SOFTWARE_VERSIONS - .out - .mqc_yml - .concat ( - ch_multiqc, - BCS_RESULTS.out.mqc_yml, - BCS_RESULTS.out.mqc_json - ) - .collect() - .set { ch_multiqc } + if (params.multiqc_run) { + DUMP_SOFTWARE_VERSIONS + .out + .mqc_yml + .concat ( + ch_multiqc, + BCS_RESULTS.out.mqc_yml, + BCS_RESULTS.out.mqc_json + ) + .collect() + .set { ch_multiqc } + + MULTIQC ( ch_multiqc ) + } - MULTIQC ( ch_multiqc ) } /* diff --git a/workflows/conf/bettercallsal.config b/workflows/conf/bettercallsal.config index 89f2997..ab40363 100644 --- a/workflows/conf/bettercallsal.config +++ b/workflows/conf/bettercallsal.config @@ -1,8 +1,8 @@ params { workflow_conceived_by = 'Kranti Konganti' workflow_built_by = 'Kranti Konganti' - workflow_version = '0.6.1' - bcs_root_dbdir = '/tmp' + workflow_version = '0.7.0' + bcs_root_dbdir = '/hpc/db/bettercallsal/latest' bcs_db_mode = 'snp' bcs_db_mode_index = (params.bcs_db_mode ==~ /snp/ ? 
'_cluster' : '_serotype') bcs_thresholds = 'strict' @@ -106,6 +106,7 @@ params { + 'scaffold_genomes') tuspy_gds = '_scaffolded_genomic.fna.gz' tuspy_n = 10 + tuspy_skip = 'PRJNA766315,PRJNA675435,PRJNA831577,PRJNA855361' sourmashsketch_run = true sourmashsketch_mode = 'dna' sourmashsketch_file = false @@ -261,6 +262,7 @@ params { fq_filter_by_len = 0 fq_suffix = (params.fq_single_end ? '.fastq.gz' : '_R1_001.fastq.gz') fq2_suffix = '_R2_001.fastq.gz' + multiqc_run = true } /* diff --git a/workflows/conf/bettercallsal_db.config b/workflows/conf/bettercallsal_db.config index b0fe9d1..44bf558 100644 --- a/workflows/conf/bettercallsal_db.config +++ b/workflows/conf/bettercallsal_db.config @@ -1,16 +1,22 @@ params { workflow_conceived_by = 'Kranti Konganti' workflow_built_by = 'Kranti Konganti' - workflow_version = '0.6.1' - pdg_release = 'PDG000000002.2658' + workflow_version = '0.7.0' + pdg_release = 'PDG000000002.2876' ncbi_asm_suffix = '_genomic.fna.gz' wcomp_serocol = false + wcomp_seronamecol = false + wcomp_acc_col = false + wcomp_target_acc_col = false wcomp_complete_sero = false wcomp_not_null_serovar = false wcomp_num = false wcomp_min_contig_size = false wcomp_i = false wsnp_serocol = false + wsnp_seronamecol = false + wsnp_acc_col = false + wsnp_target_acc_col = false wsnp_complete_sero = true wsnp_not_null_serovar = false wsnp_num = false