Skip to content

Commit

Permalink
Version, docs, message updates.
Browse files Browse the repository at this point in the history
  • Loading branch information
liberjul committed Nov 30, 2021
1 parent 5b9b325 commit 3dc184a
Show file tree
Hide file tree
Showing 10 changed files with 72 additions and 47 deletions.
53 changes: 33 additions & 20 deletions constax.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash -login

VERSION=2.0.15; BUILD=0; PREFIX=placehold
VERSION=2.0.16; BUILD=0; PREFIX=placehold
TRAIN=false
BLAST=false
HELP=false
Expand Down Expand Up @@ -45,7 +45,7 @@ echo ""
echo "Please cite us as:"
echo "CONSTAX2: Improved taxonomic classification of environmental DNA markers"
echo "Julian Aaron Liber, Gregory Bonito, Gian Maria Niccolò Benucci"
echo "bioRxiv 2021.02.15.430803; doi: https://doi.org/10.1101/2021.02.15.430803"
echo "Bioinformatics, Volume 37, Issue 21, 1 November 2021, Pages 3941–3943; doi: https://doi.org/10.1093/bioinformatics/btab347"

### Parse variable inputs
TEMP=`getopt -o c:n:m:e:p:d:i:o:x:tbhvf: --long conf:,num_threads:,max_hits:,evalue:,p_iden:,db:,input:,output:,tax:,train,blast,select_by_keyword:,msu_hpcc,help,version,conservative,make_plot,check,trainfile:,mem:,sintax_path:,utax_path:,rdp_path:,constax_path:,pathfile:,isolates:,isolates_query_coverage:,isolates_percent_identity:,high_level_db:,high_level_query_coverage:,high_level_percent_identity: \
Expand Down Expand Up @@ -180,6 +180,10 @@ then
echo "CONSTAX version $VERSION build $BUILD"
exit 1
fi
#Check Python version
python -V > ver_python.txt 2>&1
if grep -Fq "Python 2" ver_python.txt; then exit 2; fi

if [ $MAX_HITS -eq 0 ]
then
echo "Set -m/--max_hits to an integer greater than zero."
Expand Down Expand Up @@ -302,20 +306,17 @@ then
source "$PATHFILE"
elif [ -f "pathfile.txt" ] # Next try in local directory
then
echo "Using local pathfile $PATHFILE"
source "$PATHFILE"
echo "Using local pathfile.txt"
source pathfile.txt
else # Then try in package directory.
echo "Pathfile input not found in local directory ..."
DIR=$(conda list | head -n 1 | rev | cut -d' ' -f1 | rev | cut -d: -f1)
PATHFILE=$DIR"/pkgs/constax-$VERSION-$BUILD/opt/constax-$VERSION/pathfile.txt"
if [ -f "$PATHFILE" ]
then
sed -i'' -e "s|=.*/opt/constax|=$DIR/pkgs/constax-$VERSION-$BUILD/opt/constax|g" "$PATHFILE" > "$PATHFILE".tmp
source "$PATHFILE".tmp
rm "$PATHFILE".tmp
else
echo "Pathfile input not found at $PATHFILE ..."
fi
if [ -f "$PATHFILE" ]; then source $PATHFILE; echo "Pathfile input found at $PATHFILE ..."; else echo "Pathfile input not found at $PATHFILE ..."; fi
PATHFILE=$DIR"/pkgs/constax-$VERSION-$BUILD_STRING/opt/constax-$VERSION/pathfile.txt"
if [ -f "$PATHFILE" ]; then source $PATHFILE; echo "Pathfile input found at $PATHFILE ..."; else echo "Pathfile input not found at $PATHFILE ..."; fi
PATHFILE=$DIR"/opt/constax-$VERSION/pathfile.txt"
if [ -f "$PATHFILE" ]; then source $PATHFILE; echo "Pathfile input found at $PATHFILE ..."; else echo "Pathfile input not found at $PATHFILE ..."; fi
fi
# Check for user input paths
if [ $(command -v "$SINTAXPATH_USER") ] && [[ "$SINTAXPATH_USER" != false ]]
Expand Down Expand Up @@ -363,13 +364,13 @@ else
echo "RDP: $RDPPATH"
if ! [ $(command -v java -jar "$RDPPATH") ] && ! [ $(command -v "$RDPPATH") ] ; then echo "RDP not executable alone or by java -jar" ; fi
echo "UTAX: $UTAXPATH"
if ! $BLAST && ! [ $(command -v "$UTAXPATH") ] ; then echo "UTAX not executable" ; fi
if ! $BLAST && ! [ $(command -v "$UTAXPATH") ] ; then echo "UTAX not executable. Did you mean to use -b/--blast flag?" ; fi
if $BLAST && ! [ $(command -v blastn) ] ; then echo "BLAST not executable" ; fi
echo "CONSTAX: $CONSTAXPATH"
if [ -d "$CONSTAXPATH" ] ; then echo "CONSTAX directory not found" ; fi
exit 1
fi
if ! $BLAST && [ $(echo "$UTAXPATH" | grep -oP "(?<=usearch).*?(?=\.)") -gt 9 ]
if ! $BLAST && [ $(echo "$UTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/') -gt 9 ]
then
echo "USEARCH executable must be version 9.X or lower to use UTAX"
exit 1
Expand All @@ -384,7 +385,11 @@ fi
base=$(basename -- ${DB%.fasta})

FORMAT=$(python "$CONSTAXPATH"/detect_format.py -d "$DB" 2>&1)

if [[ $FORMAT == "INVALID" ]]
then
echo "Database file $DB must be in UNITE or SILVA format, exiting..."
exit 1
fi
echo "Memory size: "$MEM"mb"

if [[ "$FORMAT" == "null" ]]
Expand All @@ -397,7 +402,6 @@ then
echo "All checks passed, rerun without --check flag."
exit 0
fi

if ! $COMBINE_ONLY
then
if $TRAIN
Expand All @@ -406,7 +410,7 @@ then

echo "__________________________________________________________________________"
echo "Training SINTAX Classifier"
if [ $(echo "$SINTAXPATH" | grep -oP "(?<=usearch).*?(?=\.)") -lt 11 2> /dev/null ]
if [ $(echo "$SINTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/') -lt 11 2> /dev/null ]
then
"$SINTAXPATH" -makeudb_sintax "${TFILES}/${base}"__UTAX.fasta -output ${TFILES}/sintax.db
else
Expand Down Expand Up @@ -459,15 +463,18 @@ then
exit 1
else
echo "RDP training error overcome, continuing with classification after SINTAX is retrained"
if [ $(echo "$SINTAXPATH" | grep -oP "(?<=usearch).*?(?=\.)") -lt 11 2> /dev/null ]
if [ $(echo "$SINTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/') -lt 11 2> /dev/null ]
then
"$SINTAXPATH" -makeudb_sintax "${TFILES}/${base}"__UTAX.fasta -output ${TFILES}/sintax.db
else
"$SINTAXPATH" -makeudb_usearch "${TFILES}/${base}"__UTAX.fasta -output ${TFILES}/sintax.db
fi
fi
if [ -f rdp_train.out ]
then
rm rdp_train.out
fi
fi

# The rRNAClassifier.properties file should be in one of these two places
if [ -f "$CONSTAXPATH"/rRNAClassifier.properties ]
then
Expand Down Expand Up @@ -506,7 +513,6 @@ then
then
module load BLAST
fi

# workaround code for blast getting stuck
python "$CONSTAXPATH"/split_inputs.py -i "$FRM_INPUT"
echo > "$TAX"/blast.out
Expand Down Expand Up @@ -550,6 +556,11 @@ then
then
echo "High Level Taxonomy Assignment"
HL_FMT=$(python "$CONSTAXPATH"/detect_format.py -d "$HL_DB" 2>&1)
if [[ $HL_FMT == "INVALID" ]]
then
echo "High-level taxonomy database file $HL_DB must be in UNITE or SILVA format, exiting..."
exit 1
fi
if $MSU_HPCC && ! $BLAST
then
module load BLAST
Expand All @@ -559,6 +570,8 @@ then
rm "$TAX/"hl_formatted.fasta
blastn -query "$FRM_INPUT" -db "$TAX/$(basename -- ${HL_DB%.fasta})"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs 1 -evalue 0.001 > "$TAX"/hl_blast.out
rm "$TAX/$(basename -- ${HL_DB%.fasta})"__BLAST.n*
else
echo ""
fi
rm "$FRM_INPUT"
fi
Expand Down
4 changes: 2 additions & 2 deletions constax_no_inputs.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash -login

VERSION=2.0.15; BUILD=0; PREFIX=placehold
VERSION=2.0.16; BUILD=0; PREFIX=placehold

echo "Welcome to CONSTAX version $VERSION build $BUILD - The CONSensus TAXonomy classifier"
echo "This software is distributed under MIT License"
Expand All @@ -12,7 +12,7 @@ echo ""
echo "Please cite us as:"
echo "CONSTAX2: Improved taxonomic classification of environmental DNA markers"
echo "Julian Aaron Liber, Gregory Bonito, Gian Maria Niccolò Benucci"
echo "bioRxiv 2021.02.15.430803; doi: https://doi.org/10.1101/2021.02.15.430803"
echo "Bioinformatics, Volume 37, Issue 21, 1 November 2021, Pages 3941–3943; doi: https://doi.org/10.1093/bioinformatics/btab347"

if $SHOW_VERSION
then
Expand Down
4 changes: 2 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
# -- Project information -----------------------------------------------------

project = 'CONSTAXv2'
copyright = '2020, Julian A. Liber and Gian M. N. Benucci'
copyright = '2021, Julian A. Liber and Gian M. N. Benucci'
author = 'Julian A. Liber and Gian M. N. Benucci'

# The full version, including alpha/beta/rc tags
release = '2.0.14'
release = '2.0.16'


# -- General configuration ---------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ taxonomic units at a given taxonomic rank) is realized
by generating consensus taxonomy of available classifiers
with CONSTAX.

CONSTAX 2.0.14 improves upon 1.0.0 with the following features:
CONSTAX 2.0.16 improves upon 1.0.0 with the following features:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

* **Updated software requirements, including Python 3 and Java 8**
Expand Down Expand Up @@ -56,7 +56,7 @@ CONSTAX 1.0.0 was authored by
Reference
^^^^^^^^^

`Liber JA, Bonito G, Benucci GMN (2021) CONSTAX2: improved taxonomic classification of environmental DNA markers. Bioinformatics doi: 10.1093/bioinformatics/btab347 <https://doi.org/10.1093/bioinformatics/btab347>`_
`Liber JA, Bonito G, Benucci GMN (2021) CONSTAX2: improved taxonomic classification of environmental DNA markers. Bioinformatics doi: 10.1093/bioinformatics/btab347 <https://academic.oup.com/bioinformatics/article/37/21/3941/6271412>`_

`Gdanetz K, Benucci GMN, Vande Pol N, Bonito G (2017) CONSTAX: a tool for improved taxonomic resolution of environmental fungal ITS sequences. BMC Bioinformatics 18:538 doi 10.1186/s12859-017-1952-x <https://bmcbioinformatics.biomedcentral.com/track/pdf/10.1186/s12859-017-1952-x>`_

Expand Down
8 changes: 4 additions & 4 deletions docs/source/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@ If conda is not installed (you get an error which might include ``command not fo

.. code-block:: text
wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.9.2-Linux-x86_64.sh
bash Miniconda3-py39_4.9.2-Linux-x86_64.sh
wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh
bash Miniconda3-py39_4.10.3-Linux-x86_64.sh
.. tab:: OSX

.. code-block:: text
curl -O https://repo.anaconda.com/miniconda/Miniconda3-py39_4.9.2-MacOSX-x86_64.sh
bash Miniconda3-py39_4.9.2-MacOSX-x86_64.sh
curl -O https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-MacOSX-x86_64.sh
bash Miniconda3-py39_4.10.3-MacOSX-x86_64.sh
2. Follow the prompts.

Expand Down
15 changes: 8 additions & 7 deletions docs/source/referenceDB.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@
Suggested Reference Databases
=============================

Dependent on where your sequences originate (e.g. ITS, 16S, LSU), you will need to have an appropriate
Dependent on where your sequences originate (e.g. ITS, 16S, LSU), you will need to have an appropriate
database with which to classify them.

For Fungi or all Eukaryotes, the `UNITE <https://unite.ut.ee/>`_ database is preferred. The format of the reference database to use with
CONSTAX is one of those under the `General <https://unite.ut.ee/repository.php>`_ fasta format.
For Fungi or all Eukaryotes, the `UNITE <https://unite.ut.ee/>`_ database is preferred. The format of the reference database to use with
CONSTAX is one of those under the `General <https://unite.ut.ee/repository.php>`_ fasta format. For the latest release (10.05.2021),
training with 32GB of RAM for Fungi only or 40GB for all Eukaryotes should be sufficient.

For Bacteria and Archaea, we recommend the `SILVA <https://www.arb-silva.de/no_cache/download/archive/current/Exports/>`_ reference database.
For Bacteria and Archaea, we recommend the `SILVA <https://www.arb-silva.de/no_cache/download/archive/current/Exports/>`_ reference database.
The ``SILVA_XXX_SSURef_tax_silva.fasta.gz`` file can be gunzip-ped and used.

.. Note::
SILVA taxonomy is not assigned by Linnean ranks (*Kingdom*, *Phylum*, etc.), so instead placeholder ranks 1-n are used.
Also, the size of the SILVA database means that a server/cluster is required to train the classifier becasue
128GB RAM for the RDP training are required. If you have a computer with 32GB of RAM, you may be able to train using
SILVA taxonomy is not assigned by Linnean ranks (*Kingdom*, *Phylum*, etc.), so instead placeholder ranks 1-n are used.
Also, the size of the SILVA database means that a server/cluster is required to train the classifier becasue
128GB RAM for the RDP training are required. If you have a computer with 32GB of RAM, you may be able to train using
the UNITE database. If you cannot train locally for UNITE, the RDP files can be downloaded from `here <https://github.com/liberjul/CONSTAXv2_data/tree/master/sh_general_release_fungi_35077_RepS_04.02.2020>`_.
The ``genus_wordConditionalProbList.txt.gz`` file should be gunzip-ped after downloading.
2 changes: 1 addition & 1 deletion docs/source/tutorial1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ where you will add the absolute PATHs for the required software. VSEARCH, BLAST,
:align: center

.. warning::
Remember to navigate through your anaconda installation and find the ``constax-2.0.14/`` folder.
Remember to navigate through your anaconda installation and find the ``constax-2.0.16/`` folder.
This is the only way to make CONSTAX locate the needed python scripts.

Before you can run CONSTAX you need to activate your anaconda environment (alternatively,
Expand Down
2 changes: 1 addition & 1 deletion docs/source/tutorial2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ The code will look like as below
#SBATCH --time=10:00:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=20
#SBATCH --mem=128G
#SBATCH --mem=32G
#SBATCH --job-name constax_fungi
#SBACTH -A shade-cole-bonito
Expand Down
8 changes: 6 additions & 2 deletions docs/source/tutorial5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ Downloading the UNITE database
This tutorial is about how to obtain a reference database for classification of fungi or
eukaryotes in general. These will be downloaded from `UNITE <https://unite.ut.ee/repository.php>`_.

For classification of fungi, we have had tested with the `RepS 35077 General Release FASTA <https://plutof.ut.ee/#/doi/10.15156/BIO/786368>`_.
For classification of fungi, we have had tested with the `RepS 44343 General Release FASTA <https://plutof.ut.ee/#/doi/10.15156/BIO/1280049>`_.

The eukaryote database with `96423 RepS sequences <https://doi.org/10.15156/BIO/1280127>`_ provides better information about the kingdom
classification of the sequence, but requires slightly more RAM (~40GB). Using the ``--high_level_taxonomy`` option can provide a similar result
but with reduced RAM requirements.

.. code-block:: language
Expand All @@ -14,6 +18,6 @@ For classification of fungi, we have had tested with the `RepS 35077 General Rel
Use the FASTA called ``sh_general_release_fungi_35077_RepS_04.02.2020.fasta`` within the expanded directory for
your fungal reference database, specified with ``-d`` or ``--db`` in your ``constax`` command.

For the ``--high_level_db`` option, the eukaryotes database found here `https://plutof.ut.ee/#/doi/10.15156/BIO/786370 <https://plutof.ut.ee/#/doi/10.15156/BIO/786370>`_.
For the ``--high_level_db`` option, the eukaryotes database found here `https://plutof.ut.ee/#/doi/10.15156/BIO/1280127 <https://plutof.ut.ee/#/doi/10.15156/BIO/1280127>`_.
can be used. This will help to remove non-fungal OTUs from your dataset, or can be used as the main database (``-d, --db``)
for projects amplifying other eukaryotes.
19 changes: 13 additions & 6 deletions subscript_fasta_addFullLineage.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
#!/usr/bin/python
import sys, os

def RDP_head_to_UTAX(lineage):
def RDP_head_to_UTAX(lineage, format):
taxa = lineage.split(";")[1:]
out = ""
for r in range(len(taxa)):
out = F"{out}{'dkpcofgs'[r]}:{taxa[r]},"
if format == "UNITE":
out = F"{out}{'dkpcofg'[r]}:{taxa[r]},"
else:
out = F"{out}{'dkpcofgs'[r]}:{taxa[r]},"

return out[:-1]
def addFullLineage(filebase):
def addFullLineage(filebase, format):
print("\n\tAdding Full Lineage\n\n")
f1 = open(filebase+"__RDP_taxonomy_headers.txt", 'r').readlines()

with open(filebase+"__RDP_taxonomy_headers.txt", 'r') as f:
f1 = f.readlines()
hash = {} #lineage map

output_RDP = open(filebase+"__RDP_trained.fasta", 'w')
Expand All @@ -20,7 +26,8 @@ def addFullLineage(filebase):
ID = ID.strip(">")
hash[ID] = lineage

f2 = open(filebase+"__RDP.fasta", 'r').readlines()
with open(filebase+"__RDP.fasta", 'r') as f:
f2 = f.readlines()
for line in f2:
if line[0] == '>':
ID = line.strip().replace('>', '')
Expand All @@ -30,7 +37,7 @@ def addFullLineage(filebase):
print(ID, 'not in taxonomy file')
sys.exit()
output_RDP.write(F"{line.strip()}\t{lineage}\n")
output_UTAX.write(F"{line.strip()};tax={RDP_head_to_UTAX(lineage)};\n")
output_UTAX.write(F"{line.strip()};tax={RDP_head_to_UTAX(lineage, format)};\n")
else:
output_RDP.write(line.strip()+"\n")
output_UTAX.write(line.strip()+"\n")
Expand Down

0 comments on commit 3dc184a

Please sign in to comment.