Skip to content

Commit

Permalink
Update to 2.0.15 fixing BLAST getting stuck on 0 hit queries.
Browse files Browse the repository at this point in the history
  • Loading branch information
liberjul committed Aug 23, 2021
1 parent 42c3780 commit d8d7f28
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 39 deletions.
69 changes: 36 additions & 33 deletions CombineTaxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def reformat_UTAX(utax_file, output_dir, confidence, ranks):

################################################################################
def reformat_SINTAX(sintax_file, output_dir, confidence, ranks):
input = open (sintax_file)
input = open(sintax_file)
all_lines = input.readlines()
input.close()

Expand Down Expand Up @@ -612,6 +612,7 @@ def str2bool(v):
print("\tDone\n")
if args.hl != "null":
print("\nReformatting high level tax result file\n")
print(F"User set query coverage minimum: {args.hl_qc} User set percent identity minimum: {args.hl_id}")
hl_dict = build_iso_hl_dict(F"{args.tax}/hl_blast.out", hl=True, hl_fmt=args.hl, hl_qc=args.hl_qc, hl_id=args.hl_id)
print("\tDone\n")
print("\nGenerating consensus taxonomy & combined taxonomy table\n")
Expand All @@ -629,41 +630,42 @@ def str2bool(v):
consensus.write("\n")

if args.blast:
combined = open(F"{args.output_dir}combined_taxonomy.txt", "w")
combined.write("OTU_ID\tKingdom_RDP\tKingdom_BLAST\tKingdom_SINTAX\tKingdom_Consensus\tPhylum_RDP\tPhylum_BLAST\tPhylum_SINTAX")
combined.write("\tPhylum_Consensus\tClass_RDP\tClass_BLAST\tClass_SINTAX\tClass_Consensus\tOrder_RDP\tOrder_BLAST\tOrder_SINTAX")
combined.write("\tOrder_Consensus\tFamily_RDP\tFamily_BLAST\tFamily_SINTAX\tFamily_Consensus\tGenus_RDP\tGenus_BLAST\tGenus_SINTAX")
combined.write("\tGenus_Consensus\tSpecies_RDP\tSpecies_BLAST\tSpecies_SINTAX\tSpecies_Consensus\n")

for otu in rdp_dict.keys():
consensus.write(otu+"\t")
combined.write(otu)
levels = []
for m in range(0,len(ranks)*2,2):
level = vote(rdp_dict[otu][m:m+2], blast_dict[otu][m:m+2], sin_dict[otu][m:m+2], args.conservative)
combined.write("\t"+rdp_dict[otu][m]+"\t"+blast_dict[otu][m]+"\t"+sin_dict[otu][m]+"\t")
if level != "":
levels.append(level)
combined.write(level)
consensus.write('\t'.join(levels+[""]*(len(ranks)-len(levels))))
if args.isolates == "True":
consensus.write(F"\t{iso_dict[otu][0]}\t{iso_dict[otu][1]}\t{iso_dict[otu][2]}")
if args.hl != "null":
consensus.write(F"\t{hl_dict[otu][0]}\t{hl_dict[otu][1]}\t{hl_dict[otu][2]}")
if args.consistent:
tax_string = '\t'.join(levels)
consensus.write(F"\t{int(tax_string.replace(' ', '_').strip('_') in taxa_set)}\n")
else:
consensus.write("\n")
combined.write("\n")
print("\tDone\n")
print(hl_dict)
combined = open(F"{args.output_dir}combined_taxonomy.txt", "w")
combined.write("OTU_ID\tKingdom_RDP\tKingdom_BLAST\tKingdom_SINTAX\tKingdom_Consensus\tPhylum_RDP\tPhylum_BLAST\tPhylum_SINTAX")
combined.write("\tPhylum_Consensus\tClass_RDP\tClass_BLAST\tClass_SINTAX\tClass_Consensus\tOrder_RDP\tOrder_BLAST\tOrder_SINTAX")
combined.write("\tOrder_Consensus\tFamily_RDP\tFamily_BLAST\tFamily_SINTAX\tFamily_Consensus\tGenus_RDP\tGenus_BLAST\tGenus_SINTAX")
combined.write("\tGenus_Consensus\tSpecies_RDP\tSpecies_BLAST\tSpecies_SINTAX\tSpecies_Consensus\n")

for otu in rdp_dict.keys():
consensus.write(otu+"\t")
combined.write(otu)
levels = []
for m in range(0,len(ranks)*2,2):
level = vote(rdp_dict[otu][m:m+2], blast_dict[otu][m:m+2], sin_dict[otu][m:m+2], args.conservative)
combined.write("\t"+rdp_dict[otu][m]+"\t"+blast_dict[otu][m]+"\t"+sin_dict[otu][m]+"\t")
if level != "":
levels.append(level)
combined.write(level)
consensus.write('\t'.join(levels+[""]*(len(ranks)-len(levels))))
if args.isolates == "True":
consensus.write(F"\t{iso_dict[otu][0]}\t{iso_dict[otu][1]}\t{iso_dict[otu][2]}")
if args.hl != "null":
consensus.write(F"\t{hl_dict[otu][0]}\t{hl_dict[otu][1]}\t{hl_dict[otu][2]}")
if args.consistent:
tax_string = '\t'.join(levels)
consensus.write(F"\t{int(tax_string.replace(' ', '_').strip('_') in taxa_set)}\n")
else:
consensus.write("\n")
combined.write("\n")
print("\tDone\n")

consensus.close()
combined.close()
consensus.close()
combined.close()


print("\nGenerating classification counts & summary table\n")
count_classifications([rdp_file, sin_file, blast_file, consensus_file], args.output_dir, "UNITE", len(ranks), use_blast=True)
print("\nGenerating classification counts & summary table\n")
count_classifications([rdp_file, sin_file, blast_file, consensus_file], args.output_dir, "UNITE", len(ranks), use_blast=True)
else:
combined = open(F"{args.output_dir}combined_taxonomy.txt", "w")
combined.write("OTU_ID\tKingdom_RDP\tKingdom_SINTAX\tKingdom_UTAX\tKingdom_Consensus\tPhylum_RDP\tPhylum_SINTAX\tPhylum_UTAX")
Expand Down Expand Up @@ -761,6 +763,7 @@ def str2bool(v):
print("\tDone\n")
if args.hl != "null":
print("\nReformatting high level tax result file\n")
print(F"User set query coverage minimum: {args.hl_qc} User set percent identity minimum: {args.hl_id}")
hl_dict = build_iso_hl_dict(F"{args.tax}/hl_blast.out", hl=True, hl_fmt=args.hl, hl_qc=args.hl_qc, hl_id=args.hl_id)
print("\tDone\n")
print("\nGenerating consensus taxonomy & combined taxonomy table\n")
Expand Down
2 changes: 1 addition & 1 deletion blast_to_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
line = ifile.readline()
if line == "# 0 hits found\n": # If no hits found
buffer = F"{buffer}{quer},{'__'},{1},{0},{0.0},{0},{','.join(['unidentified']*len(ranks))}\n" # Add dummy row to be cut out later
elif line[0] != "#": # BLAST hit lines
elif line[0] != "#" and line != "\n": # BLAST hit lines
spl = line.strip().split("\t")
sub = spl[1]
eval, bitscore, id, qcov = spl[2:]
Expand Down
2 changes: 2 additions & 0 deletions check_input_names.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/env python

'''
Script to convert invalid input names to compliant ASCII names. Uses a random
hex string for file name.
Expand Down
12 changes: 10 additions & 2 deletions constax.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash -login

VERSION=2.0.14; BUILD=0; PREFIX=placehold
VERSION=2.0.15; BUILD=0; PREFIX=placehold
TRAIN=false
BLAST=false
HELP=false
Expand Down Expand Up @@ -507,7 +507,15 @@ then
module load BLAST
fi

blastn -query "$FRM_INPUT" -db "${TFILES}/${base}"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs $MAX_HITS > "$TAX"/blast.out
# workaround code for blast getting stuck
python "$CONSTAXPATH"/split_inputs.py -i "$FRM_INPUT"
echo > "$TAX"/blast.out
for i in ${FRM_INPUT%.fasta}_*".fasta"
do
blastn -query $i -db "$TFILES"/"$base"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs $MAX_HITS >> "$TAX"/blast.out
rm $i
done

python "$CONSTAXPATH"/blast_to_df.py -i "$TAX"/blast.out -o "$TAX"/otu_taxonomy.blast -d "$DB" -t "$TFILES" -f $FORMAT
else
"$UTAXPATH" -utax "$FRM_INPUT" -db "${TFILES}"/utax.db -strand both -utaxout "$TAX"/otu_taxonomy.utax -utax_cutoff $CONF -threads $NTHREADS
Expand Down
13 changes: 11 additions & 2 deletions constax_no_inputs.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash -login

VERSION=2.0.14; BUILD=0; PREFIX=placehold
VERSION=2.0.15; BUILD=0; PREFIX=placehold

echo "Welcome to CONSTAX version $VERSION build $BUILD - The CONSensus TAXonomy classifier"
echo "This software is distributed under MIT License"
Expand Down Expand Up @@ -352,8 +352,15 @@ then
then
module load BLAST
fi
# workaround code for blast getting stuck
python "$CONSTAXPATH"/split_inputs.py -i "$FRM_INPUT"
echo > "$TAX"/blast.out
for i in ${FRM_INPUT%.fasta}_*".fasta"
do
blastn -query $i -db "$TFILES"/"$base"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs $MAX_HITS >> "$TAX"/blast.out
rm $i
done

blastn -query "$FRM_INPUT" -db "${TFILES}/${base}"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs $MAX_HITS > "$TAX"/blast.out
python "$CONSTAXPATH"/blast_to_df.py -i "$TAX"/blast.out -o "$TAX"/otu_taxonomy.blast -d "$DB" -t "$TFILES" -f $FORMAT
else
"$UTAXPATH" -utax "$FRM_INPUT" -db "${TFILES}"/utax.db -strand both -utaxout "$TAX"/otu_taxonomy.utax -utax_cutoff $CONF -threads $NTHREADS
Expand Down Expand Up @@ -402,6 +409,8 @@ then
rm "$TAX/"hl_formatted.fasta
blastn -query "$FRM_INPUT" -db "$TAX/$(basename -- ${HL_DB%.fasta})"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs 1 -evalue 0.001 > "$TAX"/hl_blast.out
rm "$TAX/$(basename -- ${HL_DB%.fasta})"__BLAST.n*
else
echo ""
fi
rm "$FRM_INPUT"
fi
Expand Down
2 changes: 1 addition & 1 deletion constax_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def false_to_null(arg):
env["RDPPATH_USER"]=args.rdp_path.lower() if args.rdp_path == "False" else args.rdp_path
env["CONSTAXPATH_USER"]=args.constax_path.lower() if args.constax_path == "False" else args.constax_path

version="2.0.14"; build="0"; prefix="placehold"
version="2.0.15"; build="0"; prefix="placehold"

if os.path.isfile(args.pathfile):
with open(args.pathfile, "r") as pathfile:
Expand Down
40 changes: 40 additions & 0 deletions split_inputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python

'''
Splits the formatted input/query FASTA into 100 sequence file to
work around BLAST getting stuck on 0 hit queries, where the
0 hit query is proceeded by >100 seqs.
'''
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", type=str, help="input query file")
args = parser.parse_args()

buffer = ""
total_rec_count = 0
file_rec_count = 0
file_count = 0
prefix = args.input.split(".fasta")[0]
print("Input FASTA: ", args.input)
with open(args.input, "r") as ifile:
line = ifile.readline()
while line != "":
header = line
line = ifile.readline()
seq = ""
while line != "" and line[0] != ">":
seq += line.strip()
line = ifile.readline()
buffer += F"{header}{seq}\n"
total_rec_count += 1
file_rec_count += 1
if file_rec_count > 99:
file_rec_count = 0
with open(F"{prefix}_{file_count:04d}.fasta", "w") as ofile:
ofile.write(buffer)
file_count += 1
buffer = ""
if buffer != "":
with open(F"{prefix}_{file_count:04d}.fasta", "w") as ofile:
ofile.write(buffer)

0 comments on commit d8d7f28

Please sign in to comment.