Merge pull request #45 from gavinmdouglas/fix_spf_higher

Correct cases of intermediate unclassified and fix preceding unknown
LangilleLab · Jan 16, 2020 · f36d238 · f36d238
2 parents a8b0a5b + 12d5687
commit f36d238
Showing 1 changed file with 96 additions and 26 deletions.
diff --git a/fix_spf.py b/fix_spf.py
@@ -6,12 +6,13 @@
 import pandas as pd
 import numpy as np
 from os import path
+import sys
 import re
 from tempfile import TemporaryDirectory
 
 __author__ = "Gavin Douglas"
 __license__ = "GPL"
-__version__ = "0.2"
+__version__ = "0.3"
 
 parser = argparse.ArgumentParser(
 
@@ -29,7 +30,7 @@
             "\"uncultured\", \"Ambiguous_taxa\", \"metagenome\", or "
             "\"unidentified\", with \"Unclassified\". In addition, when this "
             "option is set it will also replace labels containing \"unknown\" "
-            "with the preceeding label followed by \"X\". Note that these "
+            "with the preceeding label followed by \"_X\". Note that these "
             "labels are all case-insensitive.",
 
 epilog='''Usage example: fix_spf.py -i otu_table.spf -o otu_table_fixed.spf''',
@@ -57,12 +58,13 @@
                          "\"unknown\" with the preceeding label followed by "
                          "\"X\".")
 
+
 def replace_ambig_labels(in_spf, out_spf, col_count):
-    '''Function to read in a SPF and to replace all labels containing 
-    "uncultured", "Ambiguous_taxa", "metagenome", or
-    "unidentified", with "Unclassified". In addition, when this 
-    option is set it will also replace labels containing "unknown
-    with the preceeding label followed by "X". Will write out the new SPF.'''
+    '''Function to read in a SPF and to replace all labels containing
+    "uncultured", "Ambiguous_taxa", "metagenome", or "unidentified", with
+    "Unclassified". In addition, when this option is set it will also replace
+    labels containing "unknown with the preceeding label followed by "X". Will
+    write out the new SPF.'''
 
     outfile = open(out_spf, "w")
     out_writer = csv.writer(outfile, delimiter="\t", lineterminator="\n")
@@ -104,7 +106,7 @@ def replace_ambig_labels(in_spf, out_spf, col_count):
             # Loop through taxa and replace any ids in set of strings to
             # replace with "Unclassified". For any taxa containing "unknown",
             # replace these ids with the preceeding taxonomic level, but with
-            # the correct DX level and followed by "X".
+            # the correct DX level and followed by "_X".
             out_taxa = []
 
             # Loop over each label (from higher to lower levels).
@@ -114,20 +116,22 @@ def replace_ambig_labels(in_spf, out_spf, col_count):
                 for s in str2replace:
                     if s in label.lower():
                         str_match = True
+                        break
 
                 if str_match:
                     out_taxa.append('Unclassified')
 
                 elif 'unknown' in label.lower():
                     pre_label_i = label_i - 1
 
-                    if pre_label_i < 0 or taxa[pre_label_i] == 'Unclassified':
+                    if pre_label_i < 0 or out_taxa[pre_label_i] == 'Unclassified':
                         out_taxa.append('Unclassified')
                     else:
-                        pre_label_search = re.search("D_\d+__(.*)", taxa[pre_label_i])
+                        pre_label_search = re.search(r"D_\d+__(.*)",
+                                                     out_taxa[pre_label_i])
                         pre_label_taxon = pre_label_search.group(1)
-                        label_level = re.match("D_\d+__", label).group(0)
-                        out_taxa.append(label_level + pre_label_taxon + "X")
+                        label_level = re.match(r"D_\d+__", label).group(0)
+                        out_taxa.append(label_level + pre_label_taxon + "_X")
 
                 else:
                     out_taxa.append(label)
@@ -138,14 +142,74 @@ def replace_ambig_labels(in_spf, out_spf, col_count):
     outfile.close()
 
 
+def check_intermediate_unclassified(orig_spf, col_count):
+    '''Check whether any higher levels are 'Unclassified' while lower levels
+    are classified. Change label to be preceding with "X" for these levels or
+    alternatively throw an error if even the top level is 'Unclassified'.
+    Returns dataframe with either no changes if there are no intermediate
+    unclassified labels or with filled in labels. Note that col_count refers to
+    the number of taxonomic columns at the start of the file.'''
+
+    max_col_i = col_count - 1
+
+    # Making a copy of this dataframe because the pandas dataframe being iterated
+    # over should not be modified.
+    new_spf = orig_spf.copy()
+
+    for row_i, row in orig_spf.iterrows():
+
+        taxa = list(orig_spf.iloc[row_i, 0:max_col_i])
+
+        unclassified_i = []
+        classified_i = []
+
+        for i, t in enumerate(taxa):
+            if t == 'Unclassified':
+                unclassified_i.append(i)
+            else:
+                classified_i.append(i)
+
+        if unclassified_i and min(unclassified_i) < max(classified_i):
+
+            # Check whether first level is classified or not.
+            if taxa[0] == 'Unclassified':
+                sys.exit("Stopping - first level of this lineage is Unclassified, "
+                         "but lower levels are classified:\n" + " ".join(taxa))
+
+            # For any cases of intermediate Unclassified labels, fill in the
+            # nearest classified parent label followed by X's equal to the number
+            # of steps away this classified label is.
+            for unclass_i in unclassified_i:
+
+                if unclass_i < max(classified_i):
+                    current_i = unclass_i
+                    unclassified_steps = 1
+
+                    while current_i > 0:
+                        current_i -= 1
+
+                        if taxa[current_i] == 'Unclassified' or \
+                           current_i not in classified_i:
+                            unclassified_steps += 1
+                            next
+                        else:
+                            taxa[unclass_i] = taxa[current_i] + "_" + "X" * unclassified_steps
+                            break
+
+        new_spf.iloc[row_i, 0:max_col_i] = taxa
+
+    return(new_spf)
+
+
 def force_strict_spf_hierarchy(in_spf, col_count):
     '''Read through pandas df (of SPF file) and check that all levels form
-    a strict hierarchy. Returns fixed df with "_dupN" added to the end of 
-    identical children with different parents.'''
+    a strict hierarchy. Returns fixed df with "_dupN" added to the end of
+    identical children with different parents. Note that col_count refers to
+    the number of taxonomic columns at the start of the file.'''
 
     # Loop over all columns with labels.
     for i in range(col_count):
-        
+
         # Skip first column.
         if i == 0:
             continue
@@ -154,30 +218,32 @@ def force_strict_spf_hierarchy(in_spf, col_count):
         prior_col = in_spf.columns[i - 1]
         unique_labels = in_spf[current_col].unique()
 
-        for l in unique_labels:
+        for label in unique_labels:
 
             # Skip Unclassified labels.
-            if l == "Unclassified":
+            if label == "Unclassified":
                 continue
 
-            matching_rows = in_spf[current_col] == l
+            matching_rows = in_spf[current_col] == label
 
             # Skip if only 1 matching row.
             if np.sum(matching_rows) == 1:
                 continue
 
             # Otherwise check whether all the parents are identical.
-            unique_parents = in_spf.loc[in_spf[current_col] == l, prior_col].unique()
-            
-            # If multiple parents (i.e. not a strict hierarchy) then add "_dupN" to
-            # all children with different parents, where N is the parent's index
-            # in the above list.
+            unique_parents = in_spf.loc[in_spf[current_col] == label, prior_col].unique()
+
+            # If multiple parents (i.e. not a strict hierarchy) then add
+            # "_dupN" to all children with different parents, where N is the
+            # parent's index in the above list.
             if len(unique_parents) > 1:
                 for idx, p in enumerate(unique_parents):
-                    new_label = l + "_dup" + str(idx)
+                    new_label = label + "_dup" + str(idx)
 
-                    # Identify rows matching the label and parent value of interest.
-                    rows2change = np.logical_and(matching_rows, in_spf[prior_col] == p)
+                    # Identify rows matching the label and parent value of
+                    # interest.
+                    rows2change = np.logical_and(matching_rows,
+                                                 in_spf[prior_col] == p)
 
                     in_spf.loc[rows2change, current_col] = new_label
 
@@ -205,6 +271,10 @@ def main():
     # Check and add in strict hierarchy of SPF levels and write out table.
     input_spf = force_strict_spf_hierarchy(input_spf, args.col_count)
 
+    # Check whether any higher levels are 'Unclassified' while lower
+    # levels are classified. Fill in different labels if needed.
+    input_spf = check_intermediate_unclassified(input_spf, args.col_count)
+
     input_spf.to_csv(path_or_buf=args.output, sep='\t', header=True,
                      index=False)