From 13c9e694d588e2110c3ca797daf27fb822260811 Mon Sep 17 00:00:00 2001
From: liberjul <liberjul@msu.edu>
Date: Thu, 4 Mar 2021 11:34:03 -0500
Subject: [PATCH] Handling duplicate taxa in UNITE, version update to 2.0.7.

---
 FormatRefDB.py                     |  3 ++-
 README.md                          |  2 +-
 constax.sh                         | 25 ++++++++++++++++++++++---
 docs/source/conf.py                |  2 +-
 docs/source/index.rst              |  2 +-
 docs/source/tutorial1.rst          | 22 ++++++++++------------
 subscript_lineage2taxonomyTrain.py | 12 +++++++++++-
 7 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/FormatRefDB.py b/FormatRefDB.py
index cfc239a..ac9539a 100644
--- a/FormatRefDB.py
+++ b/FormatRefDB.py
@@ -34,6 +34,7 @@ def convert_utax_line(utax_taxa):
 parser.add_argument("-t", "--tf", type=str, help="training files path")
 parser.add_argument("-f", "--format", type=str, help="database formatting")
 parser.add_argument("-p", "--path", type=str, help="path to subscript imports")
+parser.add_argument("--dup", action='store_true')
 args = parser.parse_args()
 
 sys.path.append(args.path + "/")
@@ -171,7 +172,7 @@ def convert_utax_line(utax_taxa):
 os.system(F"rm {filename_base}__RDP_taxonomy_trained.txt 2> /dev/null")
 os.system(F"rm {filename_base}__RDP_taxonomy_headers.txt 2> /dev/null")
 
-subscript_lineage2taxonomyTrain.lin2tax(filename_base, args.format)
+subscript_lineage2taxonomyTrain.lin2tax(filename_base, args.format, args.dup)
 subscript_fasta_addFullLineage.addFullLineage(filename_base, args.format)
 
 print("Database formatting complete\n____________________________________________________________________\n\n")
diff --git a/README.md b/README.md
index 6d81257..21eb418 100644
--- a/README.md
+++ b/README.md
@@ -106,7 +106,7 @@ python fasta_select_by_keyword.py -i SILVA_138_SSURef_tax_silva.fasta \
 constax --help
 ```
 ```
-Welcome to CONSTAX version 2.0.4 build 2 - The CONSensus TAXonomy classifier
+Welcome to CONSTAX version 2.0.7 build 0 - The CONSensus TAXonomy classifier
 This software is distributed under MIT License
 © Copyright 2020, Julian A. Liber, Gian M. N. Benucci & Gregory M. Bonito
 github.com/liberjul/CONSTAXv2
diff --git a/constax.sh b/constax.sh
index 8250d1d..ec5423f 100644
--- a/constax.sh
+++ b/constax.sh
@@ -1,6 +1,6 @@
 #!/bin/bash -login
 
-VERSION=2.0.6; BUILD=0
+VERSION=2.0.7; BUILD=0
 TRAIN=false
 BLAST=false
 HELP=false
@@ -427,9 +427,28 @@ then
 
   if [ $(command -v "$RDPPATH") ]
   then
-    "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt -Xmx"$MEM"m
+    "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt -Xmx"$MEM"m > rdp_train.out 2>&1
   else
-    java -Xmx"$MEM"m -jar "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt
+    java -Xmx"$MEM"m -jar "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt > rdp_train.out 2>&1
+  fi
+  cat rdp_train.out
+  if grep -Fq "duplicate taxon name" rdp_train.out
+  then
+    echo "RDP training error, redoing with duplicate taxa"
+    python "$CONSTAXPATH"/FormatRefDB.py -d "$DB" -t "$TFILES" -f $FORMAT -p "$CONSTAXPATH" --dup
+    if [ $(command -v "$RDPPATH") ]
+    then
+      "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt -Xmx"$MEM"m > rdp_train.out 2>&1
+    else
+      java -Xmx"$MEM"m -jar "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt > rdp_train.out 2>&1
+    fi
+    if [ -s rdp_train.out ]
+    then
+      cat rdp_train.out
+      exit 1
+    else
+      echo "RDP training error overcome, continuing with classification"
+    fi
   fi
 
   # The rRNAClassifier.properties file should be in one of these two places
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 87f444c..b797165 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -22,7 +22,7 @@
 author = 'Julian A. Liber and Gian M. N. Benucci'
 
 # The full version, including alpha/beta/rc tags
-release = '2.0.3'
+release = '2.0.7'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 306b043..2a113dd 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -3,7 +3,7 @@ Welcome to CONSTAX's documentation!
 
 **CONSTAX** (*CONSensus TAXonomy*) is a tool, written in Python 3, for improved taxonomic resolution of environmental fungal ITS sequences. Briefly, CONSTAX compares the taxonomic classifications obtained from RDP Classifier, UTAX or BLAST, and SINTAX and merges them into an improved consensus taxonomy using a 2 out of 3 rule (e.g. If an OTU is classified as taxon A by RDP and UTAX/BLAST and taxon B by SINTAX, taxon A will be used in the consensus taxonomy) and the classification p-value to break the ties (e.g. when 3 different classification are obtained for the same OTU). This tool also produces summary classification outputs that are useful for downstream analyses. In summary, our results demonstrate that independent taxonomy assignment tools classify unique members of the fungal community, and greater classification power (proportion of assigned operational taxonomic units at a given taxonomic rank) is realized by generating consensus taxonomy of available classifiers with CONSTAX.
 
-CONSTAX 2.0.6 improves upon 1.0.0 with the following features:
+CONSTAX 2.0.7 improves upon 1.0.0 with the following features:
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 * **Updated software requirements, including Python 3 and Java 8**
diff --git a/docs/source/tutorial1.rst b/docs/source/tutorial1.rst
index 2a34e89..9dd5d43 100644
--- a/docs/source/tutorial1.rst
+++ b/docs/source/tutorial1.rst
@@ -21,7 +21,7 @@ write the CONSTAX commans in it.
 .. code-block:: language
 
     gian@gian-Z390-GY:~/tutorial$ nano constax.sh
-    
+
 This is how the content of the ``.sh`` file should look like
 
 .. image:: images/script.png
@@ -31,7 +31,7 @@ This is how the content of the ``.sh`` file should look like
 
     Remember. If using a reference database for the first time, you will need to use the -t or **-\\-train** flag to train the classifiers on the dataset. The training step is necessary only at first use, you can just point to the **-\\-trainfile** <PATH> for the subsequent classifications with the same reference database.
 
-The ``--pathfile`` option is necessary ONLY if you are planning to use USEARCH instead of VSEARCH for your classification. In this case we suggestd to create a ``pathfile.txt`` 
+The ``--pathfile`` option is necessary ONLY if you are planning to use USEARCH instead of VSEARCH for your classification. In this case we suggest to create a ``pathfile.txt``
 
 .. code-block:: language
 
@@ -43,19 +43,19 @@ where you will add the abosolute PATHs for the required softwares. VSEARCH, BLAS
    :align: center
 
 .. warning::
-    Remember to navigate through your anaconda installation and find the ``constax-2.0.3/`` folder.
+    Remember to navigate through your anaconda installation and find the ``constax-2.0.7/`` folder.
     This is the only way to make CONSTAX locate the needed python scripts.
 
 Before you can run CONSTAX you need to activate your anaconda environment (alternatively,
 you can include this in the constax.sh file).
 
 .. code-block:: language
-    
+
     gian@gian-Z390-GY:~/tutorial$ conda activate
 
-To see how to set up a conda environment with CONSTAX please refer to `this link <https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html>`_. 
+To see how to set up a conda environment with CONSTAX please refer to `this link <https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html>`_.
 
-At this point your are ready to give CONSTAX a try. 
+At this point you are ready to give CONSTAX a try.
 
 .. code-block:: language
 
@@ -71,8 +71,8 @@ When CONSTAX will be done you will see the outputs in the working directory.
 .. image:: images/results.png
    :align: center
 
-Training file and classification results will be stored in the specified folders. In this example 
-the training files will be in ``training_files`` 
+Training file and classification results will be stored in the specified folders. In this example
+the training files will be in ``training_files``
 
 .. image:: images/training.png
    :align: center
@@ -82,7 +82,7 @@ and the classification in ``taxonomy_assignments``
 .. image:: images/assign.png
    :align: center
 
-The taxonomic classification of your OTUs representative sequences will be in ``consensus_taxonomy.txt``. 
+The taxonomic classification of your OTUs representative sequences will be in ``consensus_taxonomy.txt``.
 
 .. image:: images/consensus.png
    :align: center
@@ -92,11 +92,9 @@ While classifiations perfomed by each classifier will be store in ``combined_tax
 .. image:: images/combined.png
    :align: center
 
-Please explore other CONSTAX outpus, such as ``Classification_Summary.txt``. 
+Please explore other CONSTAX outputs, such as ``Classification_Summary.txt``.
 
 If you want to use some test ``otus.fasta`` to practice the use of CONSTAX you can find some in `THIS <https://github.com/liberjul/CONSTAXv2/tree/master/otu_files>`_ github repo of CONSTAX.
 
 Now. We can try to run CONSTAX again changing some parameters to see some other options.
 For example, modify the ``constax.sh`` script as showed below.
-
-
diff --git a/subscript_lineage2taxonomyTrain.py b/subscript_lineage2taxonomyTrain.py
index b2f8427..69108eb 100644
--- a/subscript_lineage2taxonomyTrain.py
+++ b/subscript_lineage2taxonomyTrain.py
@@ -3,7 +3,7 @@
 #Approach:each taxon is uniquely identified by the combination of its tax id and depth from the root rank, its attributes comprise: name, parent taxid, and level of depth from the root rank.
 import os
 
-def lin2tax(file_base, format):
+def lin2tax(file_base, format, dup=False):
 	print("\n\tTraining Taxonomy")
 	with open(file_base+"__RDP_taxonomy.txt", 'r') as f:
 		line = f.readline()
@@ -20,6 +20,8 @@ def lin2tax(file_base, format):
 		line = f.readline()
 		if format == "UNITE":
 			name_to_end = {}
+			if dup:
+				end_name_dict = {}
 			while line != "":
 				rec_count = 0
 				th_buf = "" # taxon header buffer
@@ -50,6 +52,13 @@ def lin2tax(file_base, format):
 						ID += 1
 						hash[name] = ID #add name-id to the map
 						end_name = name.split(';')[-1]
+						if dup:
+							if end_name not in end_name_dict:
+								end_name_dict[end_name] = 1
+								end_name = F"{end_name}_1"
+							else:
+								end_name_dict[end_name] += 1
+								end_name = F"{end_name}_{end_name_dict[end_name]}"
 						header = F"{header};{end_name}"
 						name_to_end[name] = end_name
 						output_buf = F"{output_buf}{ID}*{end_name}*{pID}*{depth}*{rank}\n"
@@ -92,6 +101,7 @@ def lin2tax(file_base, format):
 						ID += 1
 						hash[name] = ID #add name-id to the map
 						end_name = name.split(';')[-1]
+						# Allow for taxa which have more than 1 parent lineage
 						if end_name not in end_name_dict:
 							end_name_dict[end_name] = 1
 							end_name = F"{end_name}_1"