
Commit

v0.7.0 of bettercallsal with QoL updates. Fixes #1.
biocoder committed Jan 2, 2024
1 parent e7330c6 commit 095e6ea
Showing 20 changed files with 377 additions and 88 deletions.
17 changes: 11 additions & 6 deletions bin/dl_pdg_metadata.py
@@ -2,15 +2,16 @@

# Kranti Konganti

import argparse
import inspect
import logging
import os
import re
import shutil
import ssl
import tempfile
from html.parser import HTMLParser
from urllib.request import urlopen

# Set logging.
logging.basicConfig(
@@ -41,6 +42,10 @@ def dl_pdg(**kwargs) -> None:
"""
db_path, url, regex, suffix, overwrite, release = [kwargs[k] for k in kwargs.keys()]

# Relax TLS verification (hostname and certificate checks) for the urlopen() calls below.
contxt = ssl.create_default_context()
contxt.check_hostname = False
contxt.verify_mode = ssl.CERT_NONE

if db_path is None or url is None:
logging.error("Please provide absolute UNIX path\n" + "to store the result DB flat files.")
exit(1)
@@ -51,7 +56,7 @@ def dl_pdg(**kwargs) -> None:
html_parser = NCBIPathogensHTMLParser()
logging.info(f"Finding latest NCBI PDG release at:\n{url}")

with urlopen(url, context=contxt) as response:
with tempfile.NamedTemporaryFile(delete=False) as tmp_html_file:
shutil.copyfileobj(response, tmp_html_file)

@@ -86,7 +91,7 @@ def dl_pdg(**kwargs) -> None:
tsv_at = os.path.join(dest_dir, pdg_filename)
logging.info(f"Saving to:\n{tsv_at}")

with urlopen(pdg_metadata_url, context=contxt) as response:
with open(tsv_at, "w") as tsv:
tsv.writelines(response.read().decode("utf-8"))

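
As an aside, the unverified-TLS pattern this commit introduces can be exercised on its own. A minimal sketch, assuming the top-level NCBI Pathogen Detection results listing as an illustrative URL (the workflow itself supplies its own PDG release URL):

import ssl
from urllib.request import urlopen

# Build a default context, then disable hostname and certificate checks,
# mirroring the change in dl_pdg() above.
contxt = ssl.create_default_context()
contxt.check_hostname = False
contxt.verify_mode = ssl.CERT_NONE

# Illustrative URL; any HTTPS endpoint would demonstrate the same behaviour.
with urlopen("https://ftp.ncbi.nlm.nih.gov/pathogen/Results/", context=contxt) as response:
    print(response.status)
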
170 changes: 149 additions & 21 deletions bin/get_top_unique_mash_hit_genomes.py
@@ -2,16 +2,18 @@

# Kranti Konganti

import argparse
import glob
import inspect
import logging
import os
import pickle
import pprint
import re
import subprocess
from collections import defaultdict


# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
pass
@@ -20,17 +22,24 @@ class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
# Main
def main() -> None:
"""
This script works only in the context of a Nextflow workflow.
It takes:
1. A pickle file containing a dictionary object where genome accession
is the key and the computed serotype is the value.
2. A file with `mash screen` results run against the Salmonella SNP
Cluster genomes' sketch.
OR
1. It takes a pickle file containing a nested dictionary, where genome accession
is the key and the metadata is a dictionary associated with that key.
2. A file with `mash screen` results.
3. A directory containing genomes' FASTA in gzipped format where the
FASTA file contains 2 lines: one FASTA header followed by
genome sequence,
and then generates a concatenated FASTA file of the top N unique `mash screen`
genome hits as requested.
In addition:
1. The user can skip `mash screen` hits that originate from the supplied
BioProject accession(s).
For the -skip option to work, ncbi-datasets should be available in $PATH.
"""

# Set logging.
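
Given the inputs described in the docstring above, a typical invocation wiring in the new -skip and -op options might look like the sketch below (all file and directory names are placeholders; -m, -ps and -gd are the trio referenced together in the error handling further down):

python get_top_unique_mash_hit_genomes.py \
    -m mash_screen.tsv \
    -ps acc2sero.pickle \
    -gd genomes/ \
    -n 10 \
    -skip PRJNA766315,PRJNA675435 \
    -op MASH_SCREEN
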
@@ -109,12 +118,22 @@ def main() -> None:
"-n",
dest="num_uniq_hits",
default=10,
help="This many number of serotype genomes' accessions are " + "\nreturned.",
required=False,
help="This many number of serotype genomes' accessions are returned.",
)
parser.add_argument(
"-skip",
dest="skip_accs",
default=str(""),
required=False,
help="Skip all hits which belong to the following bioproject accession(s).\n"
+ "A comma separated list of more than one bioproject.",
)
parser.add_argument(
"-op",
dest="out_prefix",
default="MASH_SCREEN",
required=False,
help="Set the output file prefix for .fna.gz and .txt files.",
)
# required = parser.add_argument_group('required arguments')
@@ -129,12 +148,34 @@ def main() -> None:
genomes_dir = args.genomes_dir
genomes_dir_suffix = args.genomes_dir_suffix
out_prefix = args.out_prefix
skip_accs = args.skip_accs
skip_accs_list = list()
# One or more comma-separated BioProject accessions, e.g. PRJNA766315,PRJNA675435.
skip_check = re.compile(r"PRJNA\d+(?:,PRJNA\d+)*")
req_metadata = {
"mlst_sequence_type": "ST",
"epi_type": "ET",
"host": "HO",
"host_disease": "HD",
"isolation_source": "IS",
"outbreak": "OU",
"source_type": "SOT",
"strain": "GS",
}
target_acc_key = "target_acc"
ncbi_path_heading = "NCBI Pathogen Isolates Browser"
ncbi_path_uri = "https://www.ncbi.nlm.nih.gov/pathogens/isolates/#"
mash_genomes_gz = os.path.join(
os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz"
)
mash_uniq_hits_txt = os.path.join(
os.getcwd(), re.sub(r"\.fna\.gz", ".txt", os.path.basename(mash_genomes_gz))
)
mash_uniq_accs_txt = os.path.join(
os.getcwd(), re.sub(r"\.fna\.gz", "_ACCS.txt", os.path.basename(mash_genomes_gz))
)
mash_popup_info_txt = os.path.join(
os.getcwd(), re.sub(r"\.fna\.gz", "_POPUP.txt", os.path.basename(mash_genomes_gz))
)

if mash_screen_res and os.path.exists(mash_genomes_gz):
logging.error(
@@ -153,6 +194,36 @@ def main() -> None:
logging.error("When -m is on, -ps and -gd are also required.")
exit(1)

if skip_accs and not skip_check.match(skip_accs):
logging.error(
"Supplied bio project accessions are not valid!\n"
+ "Valid options:\n\t-skip PRJNA766315\n\t-skip PRJNA766315,PRJNA675435"
)
exit(1)
elif skip_check.match(skip_accs):
datasets_cmd = "datasets summary genome accession --as-json-lines --report ids_only".split()
datasets_cmd.append(skip_accs)
dataformat_cmd = "dataformat tsv genome --fields accession --elide-header".split()
try:
accs_query = subprocess.run(datasets_cmd, capture_output=True, check=True)
try:
skip_accs_list = (
subprocess.check_output(dataformat_cmd, input=accs_query.stdout)
.decode("utf-8")
.split("\n")
)
except subprocess.CalledProcessError as e:
logging.error(f"Query failed\n\t{dataformat_cmd.join(' ')}\nError:\n\t{e}")
exit(1)
except subprocess.CalledProcessError as e:
logging.error(f"Query failed\n\t{datasets_cmd.join(' ')}\nError:\n\t{e}")
exit(1)

if len(skip_accs_list) > 0:
filter_these_hits = list(filter(bool, skip_accs_list))
else:
filter_these_hits = list()
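
For reference, the two subprocess calls above chain the ncbi-datasets tools, feeding the `datasets` JSON-lines output into `dataformat`. A standalone sketch of the same handoff, assuming PRJNA766315 as an illustrative accession and both tools on $PATH:

import subprocess

# Equivalent shell pipeline:
#   datasets summary genome accession --as-json-lines --report ids_only PRJNA766315 \
#       | dataformat tsv genome --fields accession --elide-header
datasets_cmd = "datasets summary genome accession --as-json-lines --report ids_only".split()
datasets_cmd.append("PRJNA766315")
dataformat_cmd = "dataformat tsv genome --fields accession --elide-header".split()

accs_query = subprocess.run(datasets_cmd, capture_output=True, check=True)
accessions = (
    subprocess.check_output(dataformat_cmd, input=accs_query.stdout)
    .decode("utf-8")
    .split("\n")
)
print(list(filter(bool, accessions)))  # drop the empty string left by the trailing newline
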

if genomes_dir:
if not os.path.isdir(genomes_dir):
logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
@@ -218,7 +289,6 @@ def main() -> None:
write_pickled_sero.close()
exit(0)
elif pickle_sero and not (os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0):

logging.error(
"Requested to create pickle from metadata, but\n"
+ f"the file, {os.path.basename(pickle_sero)} is empty or\ndoes not exist!"
@@ -227,7 +297,6 @@ def main() -> None:

if mash_screen_res and os.path.exists(mash_screen_res):
if os.path.getsize(mash_screen_res) > 0:

seen_uniq_hits = 0
unpickled_acc2serotype = pickle.load(file=open(pickled_sero, "rb"))

@@ -271,25 +340,84 @@ def main() -> None:

# ppp.pprint(mash_hits)
msh_out_txt = open(mash_uniq_hits_txt, "w")
wrote_header_pop = False
wrote_header_acc = False

with open(mash_genomes_gz, "wb") as msh_out_gz:
for _, (ident, acc_list) in enumerate(sorted(mash_hits.items(), reverse=True)):
for acc in acc_list:
if len(filter_these_hits) > 0 and acc in filter_these_hits:
continue
if seen_uniq_hits >= num_uniq_hits:
break
if isinstance(unpickled_acc2serotype[acc], dict):
if target_acc_key in unpickled_acc2serotype[acc].keys():
if not wrote_header_pop:
mash_out_pop_txt = open(mash_popup_info_txt, "w")
mash_out_pop_txt.write("POPUP_INFO\nSEPARATOR COMMA\nDATA\n")
wrote_header_pop = True

pdt = "".join(unpickled_acc2serotype[acc][target_acc_key])

popup_line = ",".join(
[
acc,
ncbi_path_heading,
f'<a target="_blank" href="{ncbi_path_uri + pdt}">{pdt}</a>',
]
)
mash_out_pop_txt.write(popup_line + "\n")

if all(
k in unpickled_acc2serotype[acc].keys() for k in req_metadata.keys()
):
if not wrote_header_acc:
msh_out_accs_txt = open(mash_uniq_accs_txt, "w")
msh_out_txt.write("METADATA\nSEPARATOR COMMA\nFIELD_LABELS,")
msh_out_txt.write(
f"{','.join([str(key).upper() for key in req_metadata.keys()])}\nDATA\n"
)
wrote_header_acc = True

metadata_line = ",".join(
[
re.sub(
",",
"",
"|".join(unpickled_acc2serotype[acc][m]),
)
for m in req_metadata.keys()
],
)

msh_out_txt.write(f"{acc.strip()},{metadata_line}\n")
msh_out_accs_txt.write(
f"{os.path.join(genomes_dir, acc + genomes_dir_suffix)}\n"
)
with open(
os.path.join(genomes_dir, acc + genomes_dir_suffix), "rb"
) as msh_in_gz:
msh_out_gz.writelines(msh_in_gz.readlines())
msh_in_gz.close()
seen_mash_sero[acc] = 1
seen_uniq_hits += 1
elif not isinstance(unpickled_acc2serotype[acc], dict):
if unpickled_acc2serotype[acc] not in seen_mash_sero.keys():
seen_mash_sero[unpickled_acc2serotype[acc]] = 1
seen_uniq_hits += 1
# print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout)
msh_out_txt.write(
f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n"
)
with open(
os.path.join(genomes_dir, acc + genomes_dir_suffix),
"rb",
) as msh_in_gz:
msh_out_gz.writelines(msh_in_gz.readlines())
msh_in_gz.close()
msh_out_gz.close()
msh_out_txt.close()

if "msh_out_accs_txt" in locals().keys() and not msh_out_accs_txt.closed:
msh_out_accs_txt.close()
if "mash_out_pop_txt" in locals().keys() and not mash_out_pop_txt.closed:
mash_out_pop_txt.close()

logging.info(
f"File {os.path.basename(mash_genomes_gz)}\n"
+ f"written in:\n{os.getcwd()}\nDone! Bye!"