Skip to content

Commit

Permalink
Merge pull request #45 from gavinmdouglas/fix_spf_higher
Browse files Browse the repository at this point in the history
Correct cases of intermediate unclassified and fix preceding unknown
  • Loading branch information
gavinmdouglas authored Jan 16, 2020
2 parents a8b0a5b + 12d5687 commit f36d238
Showing 1 changed file with 96 additions and 26 deletions.
122 changes: 96 additions & 26 deletions fix_spf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
import pandas as pd
import numpy as np
from os import path
import sys
import re
from tempfile import TemporaryDirectory

__author__ = "Gavin Douglas"
__license__ = "GPL"
__version__ = "0.2"
__version__ = "0.3"

parser = argparse.ArgumentParser(

Expand All @@ -29,7 +30,7 @@
"\"uncultured\", \"Ambiguous_taxa\", \"metagenome\", or "
"\"unidentified\", with \"Unclassified\". In addition, when this "
"option is set it will also replace labels containing \"unknown\" "
"with the preceeding label followed by \"X\". Note that these "
"with the preceeding label followed by \"_X\". Note that these "
"labels are all case-insensitive.",

epilog='''Usage example: fix_spf.py -i otu_table.spf -o otu_table_fixed.spf''',
Expand Down Expand Up @@ -57,12 +58,13 @@
"\"unknown\" with the preceeding label followed by "
"\"X\".")


def replace_ambig_labels(in_spf, out_spf, col_count):
'''Function to read in a SPF and to replace all labels containing
"uncultured", "Ambiguous_taxa", "metagenome", or
"unidentified", with "Unclassified". In addition, when this
option is set it will also replace labels containing "unknown
with the preceeding label followed by "X". Will write out the new SPF.'''
'''Function to read in a SPF and to replace all labels containing
"uncultured", "Ambiguous_taxa", "metagenome", or "unidentified", with
"Unclassified". In addition, when this option is set it will also replace
labels containing "unknown with the preceeding label followed by "X". Will
write out the new SPF.'''

outfile = open(out_spf, "w")
out_writer = csv.writer(outfile, delimiter="\t", lineterminator="\n")
Expand Down Expand Up @@ -104,7 +106,7 @@ def replace_ambig_labels(in_spf, out_spf, col_count):
# Loop through taxa and replace any ids in set of strings to
# replace with "Unclassified". For any taxa containing "unknown",
# replace these ids with the preceeding taxonomic level, but with
# the correct DX level and followed by "X".
# the correct DX level and followed by "_X".
out_taxa = []

# Loop over each label (from higher to lower levels).
Expand All @@ -114,20 +116,22 @@ def replace_ambig_labels(in_spf, out_spf, col_count):
for s in str2replace:
if s in label.lower():
str_match = True
break

if str_match:
out_taxa.append('Unclassified')

elif 'unknown' in label.lower():
pre_label_i = label_i - 1

if pre_label_i < 0 or taxa[pre_label_i] == 'Unclassified':
if pre_label_i < 0 or out_taxa[pre_label_i] == 'Unclassified':
out_taxa.append('Unclassified')
else:
pre_label_search = re.search("D_\d+__(.*)", taxa[pre_label_i])
pre_label_search = re.search(r"D_\d+__(.*)",
out_taxa[pre_label_i])
pre_label_taxon = pre_label_search.group(1)
label_level = re.match("D_\d+__", label).group(0)
out_taxa.append(label_level + pre_label_taxon + "X")
label_level = re.match(r"D_\d+__", label).group(0)
out_taxa.append(label_level + pre_label_taxon + "_X")

else:
out_taxa.append(label)
Expand All @@ -138,14 +142,74 @@ def replace_ambig_labels(in_spf, out_spf, col_count):
outfile.close()


def check_intermediate_unclassified(orig_spf, col_count):
'''Check whether any higher levels are 'Unclassified' while lower levels
are classified. Change label to be preceding with "X" for these levels or
alternatively throw an error if even the top level is 'Unclassified'.
Returns dataframe with either no changes if there are no intermediate
unclassified labels or with filled in labels. Note that col_count refers to
the number of taxonomic columns at the start of the file.'''

max_col_i = col_count - 1

# Making a copy of this dataframe because the pandas dataframe being iterated
# over should not be modified.
new_spf = orig_spf.copy()

for row_i, row in orig_spf.iterrows():

taxa = list(orig_spf.iloc[row_i, 0:max_col_i])

unclassified_i = []
classified_i = []

for i, t in enumerate(taxa):
if t == 'Unclassified':
unclassified_i.append(i)
else:
classified_i.append(i)

if unclassified_i and min(unclassified_i) < max(classified_i):

# Check whether first level is classified or not.
if taxa[0] == 'Unclassified':
sys.exit("Stopping - first level of this lineage is Unclassified, "
"but lower levels are classified:\n" + " ".join(taxa))

# For any cases of intermediate Unclassified labels, fill in the
# nearest classified parent label followed by X's equal to the number
# of steps away this classified label is.
for unclass_i in unclassified_i:

if unclass_i < max(classified_i):
current_i = unclass_i
unclassified_steps = 1

while current_i > 0:
current_i -= 1

if taxa[current_i] == 'Unclassified' or \
current_i not in classified_i:
unclassified_steps += 1
next
else:
taxa[unclass_i] = taxa[current_i] + "_" + "X" * unclassified_steps
break

new_spf.iloc[row_i, 0:max_col_i] = taxa

return(new_spf)


def force_strict_spf_hierarchy(in_spf, col_count):
'''Read through pandas df (of SPF file) and check that all levels form
a strict hierarchy. Returns fixed df with "_dupN" added to the end of
identical children with different parents.'''
a strict hierarchy. Returns fixed df with "_dupN" added to the end of
identical children with different parents. Note that col_count refers to
the number of taxonomic columns at the start of the file.'''

# Loop over all columns with labels.
for i in range(col_count):

# Skip first column.
if i == 0:
continue
Expand All @@ -154,30 +218,32 @@ def force_strict_spf_hierarchy(in_spf, col_count):
prior_col = in_spf.columns[i - 1]
unique_labels = in_spf[current_col].unique()

for l in unique_labels:
for label in unique_labels:

# Skip Unclassified labels.
if l == "Unclassified":
if label == "Unclassified":
continue

matching_rows = in_spf[current_col] == l
matching_rows = in_spf[current_col] == label

# Skip if only 1 matching row.
if np.sum(matching_rows) == 1:
continue

# Otherwise check whether all the parents are identical.
unique_parents = in_spf.loc[in_spf[current_col] == l, prior_col].unique()
# If multiple parents (i.e. not a strict hierarchy) then add "_dupN" to
# all children with different parents, where N is the parent's index
# in the above list.
unique_parents = in_spf.loc[in_spf[current_col] == label, prior_col].unique()

# If multiple parents (i.e. not a strict hierarchy) then add
# "_dupN" to all children with different parents, where N is the
# parent's index in the above list.
if len(unique_parents) > 1:
for idx, p in enumerate(unique_parents):
new_label = l + "_dup" + str(idx)
new_label = label + "_dup" + str(idx)

# Identify rows matching the label and parent value of interest.
rows2change = np.logical_and(matching_rows, in_spf[prior_col] == p)
# Identify rows matching the label and parent value of
# interest.
rows2change = np.logical_and(matching_rows,
in_spf[prior_col] == p)

in_spf.loc[rows2change, current_col] = new_label

Expand Down Expand Up @@ -205,6 +271,10 @@ def main():
# Check and add in strict hierarchy of SPF levels and write out table.
input_spf = force_strict_spf_hierarchy(input_spf, args.col_count)

# Check whether any higher levels are 'Unclassified' while lower
# levels are classified. Fill in different labels if needed.
input_spf = check_intermediate_unclassified(input_spf, args.col_count)

input_spf.to_csv(path_or_buf=args.output, sep='\t', header=True,
index=False)

Expand Down

0 comments on commit f36d238

Please sign in to comment.