Handling duplicate taxa in UNITE, version update to 2.0.7.

liberjul · Mar 4, 2021 · 13c9e69 · 13c9e69
1 parent 92f6898
commit 13c9e69
Show file tree

Hide file tree

Showing 7 changed files with 48 additions and 20 deletions.
diff --git a/FormatRefDB.py b/FormatRefDB.py
@@ -34,6 +34,7 @@ def convert_utax_line(utax_taxa):
 parser.add_argument("-t", "--tf", type=str, help="training files path")
 parser.add_argument("-f", "--format", type=str, help="database formatting")
 parser.add_argument("-p", "--path", type=str, help="path to subscript imports")
+parser.add_argument("--dup", action='store_true')
 args = parser.parse_args()
 
 sys.path.append(args.path + "/")
@@ -171,7 +172,7 @@ def convert_utax_line(utax_taxa):
 os.system(F"rm {filename_base}__RDP_taxonomy_trained.txt 2> /dev/null")
 os.system(F"rm {filename_base}__RDP_taxonomy_headers.txt 2> /dev/null")
 
-subscript_lineage2taxonomyTrain.lin2tax(filename_base, args.format)
+subscript_lineage2taxonomyTrain.lin2tax(filename_base, args.format, args.dup)
 subscript_fasta_addFullLineage.addFullLineage(filename_base, args.format)
 
 print("Database formatting complete\n____________________________________________________________________\n\n")
diff --git a/README.md b/README.md
@@ -106,7 +106,7 @@ python fasta_select_by_keyword.py -i SILVA_138_SSURef_tax_silva.fasta \
 constax --help
 ```
 ```
-Welcome to CONSTAX version 2.0.4 build 2 - The CONSensus TAXonomy classifier
+Welcome to CONSTAX version 2.0.7 build 0 - The CONSensus TAXonomy classifier
 This software is distributed under MIT License
 © Copyright 2020, Julian A. Liber, Gian M. N. Benucci & Gregory M. Bonito
 github.com/liberjul/CONSTAXv2

diff --git a/constax.sh b/constax.sh
@@ -1,6 +1,6 @@
 #!/bin/bash -login
 
-VERSION=2.0.6; BUILD=0
+VERSION=2.0.7; BUILD=0
 TRAIN=false
 BLAST=false
 HELP=false
@@ -427,9 +427,28 @@ then
 
   if [ $(command -v "$RDPPATH") ]
   then
-    "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt -Xmx"$MEM"m
+    "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt -Xmx"$MEM"m > rdp_train.out 2>&1
   else
-    java -Xmx"$MEM"m -jar "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt
+    java -Xmx"$MEM"m -jar "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt > rdp_train.out 2>&1
+  fi
+  cat rdp_train.out
+  if grep -Fq "duplicate taxon name" rdp_train.out
+  then
+    echo "RDP training error, redoing with duplicate taxa"
+    python "$CONSTAXPATH"/FormatRefDB.py -d "$DB" -t "$TFILES" -f $FORMAT -p "$CONSTAXPATH" --dup
+    if [ $(command -v "$RDPPATH") ]
+    then
+      "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt -Xmx"$MEM"m > rdp_train.out 2>&1
+    else
+      java -Xmx"$MEM"m -jar "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt > rdp_train.out 2>&1
+    fi
+    if [ -s rdp_train.out ]
+    then
+      cat rdp_train.out
+      exit 1
+    else
+      echo "RDP training error overcome, continuing with classification"
+    fi
   fi
 
   # The rRNAClassifier.properties file should be in one of these two places

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -22,7 +22,7 @@
 author = 'Julian A. Liber and Gian M. N. Benucci'
 
 # The full version, including alpha/beta/rc tags
-release = '2.0.3'
+release = '2.0.7'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -3,7 +3,7 @@ Welcome to CONSTAX's documentation!
 
 **CONSTAX** (*CONSensus TAXonomy*) is a tool, written in Python 3, for improved taxonomic resolution of environmental fungal ITS sequences. Briefly, CONSTAX compares the taxonomic classifications obtained from RDP Classifier, UTAX or BLAST, and SINTAX and merges them into an improved consensus taxonomy using a 2 out of 3 rule (e.g. If an OTU is classified as taxon A by RDP and UTAX/BLAST and taxon B by SINTAX, taxon A will be used in the consensus taxonomy) and the classification p-value to break the ties (e.g. when 3 different classification are obtained for the same OTU). This tool also produces summary classification outputs that are useful for downstream analyses. In summary, our results demonstrate that independent taxonomy assignment tools classify unique members of the fungal community, and greater classification power (proportion of assigned operational taxonomic units at a given taxonomic rank) is realized by generating consensus taxonomy of available classifiers with CONSTAX.
 
-CONSTAX 2.0.6 improves upon 1.0.0 with the following features:
+CONSTAX 2.0.7 improves upon 1.0.0 with the following features:
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 * **Updated software requirements, including Python 3 and Java 8**

diff --git a/docs/source/tutorial1.rst b/docs/source/tutorial1.rst
@@ -21,7 +21,7 @@ write the CONSTAX commans in it.
 .. code-block:: language
 
     gian@gian-Z390-GY:~/tutorial$ nano constax.sh
-    
+
 This is how the content of the ``.sh`` file should look like
 
 .. image:: images/script.png
@@ -31,7 +31,7 @@ This is how the content of the ``.sh`` file should look like
 
     Remember. If using a reference database for the first time, you will need to use the -t or **-\\-train** flag to train the classifiers on the dataset. The training step is necessary only at first use, you can just point to the **-\\-trainfile** <PATH> for the subsequent classifications with the same reference database.
 
-The ``--pathfile`` option is necessary ONLY if you are planning to use USEARCH instead of VSEARCH for your classification. In this case we suggestd to create a ``pathfile.txt`` 
+The ``--pathfile`` option is necessary ONLY if you are planning to use USEARCH instead of VSEARCH for your classification. In this case we suggest creating a ``pathfile.txt``
 
 .. code-block:: language
 
@@ -43,19 +43,19 @@ where you will add the abosolute PATHs for the required softwares. VSEARCH, BLAS
    :align: center
 
 .. warning::
-    Remember to navigate through your anaconda installation and find the ``constax-2.0.3/`` folder.
+    Remember to navigate through your anaconda installation and find the ``constax-2.0.7/`` folder.
     This is the only way to make CONSTAX locate the needed python scripts.
 
 Before you can run CONSTAX you need to activate your anaconda environment (alternatively,
 you can include this in the constax.sh file).
 
 .. code-block:: language
-    
+
     gian@gian-Z390-GY:~/tutorial$ conda activate
 
-To see how to set up a conda environment with CONSTAX please refer to `this link <https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html>`_. 
+To see how to set up a conda environment with CONSTAX please refer to `this link <https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html>`_.
 
-At this point your are ready to give CONSTAX a try. 
+At this point you are ready to give CONSTAX a try.
 
 .. code-block:: language
 
@@ -71,8 +71,8 @@ When CONSTAX will be done you will see the outputs in the working directory.
 .. image:: images/results.png
    :align: center
 
-Training file and classification results will be stored in the specified folders. In this example 
-the training files will be in ``training_files`` 
+Training file and classification results will be stored in the specified folders. In this example
+the training files will be in ``training_files``
 
 .. image:: images/training.png
    :align: center
@@ -82,7 +82,7 @@ and the classification in ``taxonomy_assignments``
 .. image:: images/assign.png
    :align: center
 
-The taxonomic classification of your OTUs representative sequences will be in ``consensus_taxonomy.txt``. 
+The taxonomic classification of your OTU representative sequences will be in ``consensus_taxonomy.txt``.
 
 .. image:: images/consensus.png
    :align: center
@@ -92,11 +92,9 @@ While classifiations perfomed by each classifier will be store in ``combined_tax
 .. image:: images/combined.png
    :align: center
 
-Please explore other CONSTAX outpus, such as ``Classification_Summary.txt``. 
+Please explore other CONSTAX outputs, such as ``Classification_Summary.txt``.
 
 If you want to use some test ``otus.fasta`` to practice the use of CONSTAX you can find some in `THIS <https://github.com/liberjul/CONSTAXv2/tree/master/otu_files>`_ github repo of CONSTAX.
 
 Now. We can try to run CONSTAX again changing some parameters to see some other options.
 For example, modify the ``constax.sh`` script as showed below.
-
-
diff --git a/subscript_lineage2taxonomyTrain.py b/subscript_lineage2taxonomyTrain.py
@@ -3,7 +3,7 @@
 #Approach:each taxon is uniquely identified by the combination of its tax id and depth from the root rank, its attributes comprise: name, parent taxid, and level of depth from the root rank.
 import os
 
-def lin2tax(file_base, format):
+def lin2tax(file_base, format, dup=False):
 	print("\n\tTraining Taxonomy")
 	with open(file_base+"__RDP_taxonomy.txt", 'r') as f:
 		line = f.readline()
@@ -20,6 +20,8 @@ def lin2tax(file_base, format):
 		line = f.readline()
 		if format == "UNITE":
 			name_to_end = {}
+			if dup:
+				end_name_dict = {}
 			while line != "":
 				rec_count = 0
 				th_buf = "" # taxon header buffer
@@ -50,6 +52,13 @@ def lin2tax(file_base, format):
 						ID += 1
 						hash[name] = ID #add name-id to the map
 						end_name = name.split(';')[-1]
+						if dup:
+							if end_name not in end_name_dict:
+								end_name_dict[end_name] = 1
+								end_name = F"{end_name}_1"
+							else:
+								end_name_dict[end_name] += 1
+								end_name = F"{end_name}_{end_name_dict[end_name]}"
 						header = F"{header};{end_name}"
 						name_to_end[name] = end_name
 						output_buf = F"{output_buf}{ID}*{end_name}*{pID}*{depth}*{rank}\n"
@@ -92,6 +101,7 @@ def lin2tax(file_base, format):
 						ID += 1
 						hash[name] = ID #add name-id to the map
 						end_name = name.split(';')[-1]
+						# Allow for taxa which have more than 1 parent lineage
 						if end_name not in end_name_dict:
 							end_name_dict[end_name] = 1
 							end_name = F"{end_name}_1"