diff --git a/constax.sh b/constax.sh index 91b730e..4177031 100644 --- a/constax.sh +++ b/constax.sh @@ -1,6 +1,6 @@ #!/bin/bash -login -VERSION=2.0.15; BUILD=0; PREFIX=placehold +VERSION=2.0.16; BUILD=0; PREFIX=placehold TRAIN=false BLAST=false HELP=false @@ -45,7 +45,7 @@ echo "" echo "Please cite us as:" echo "CONSTAX2: Improved taxonomic classification of environmental DNA markers" echo "Julian Aaron Liber, Gregory Bonito, Gian Maria Niccolò Benucci" -echo "bioRxiv 2021.02.15.430803; doi: https://doi.org/10.1101/2021.02.15.430803" +echo "Bioinformatics, Volume 37, Issue 21, 1 November 2021, Pages 3941–3943; doi: https://doi.org/10.1093/bioinformatics/btab347" ### Parse variable inputs TEMP=`getopt -o c:n:m:e:p:d:i:o:x:tbhvf: --long conf:,num_threads:,max_hits:,evalue:,p_iden:,db:,input:,output:,tax:,train,blast,select_by_keyword:,msu_hpcc,help,version,conservative,make_plot,check,trainfile:,mem:,sintax_path:,utax_path:,rdp_path:,constax_path:,pathfile:,isolates:,isolates_query_coverage:,isolates_percent_identity:,high_level_db:,high_level_query_coverage:,high_level_percent_identity: \ @@ -180,6 +180,10 @@ then echo "CONSTAX version $VERSION build $BUILD" exit 1 fi +#Check Python version +python -V > ver_python.txt 2>&1 +if grep -Fq "Python 2" ver_python.txt; then exit 2; fi + if [ $MAX_HITS -eq 0 ] then echo "Set -m/--max_hits to an integer greater than zero." @@ -302,20 +306,17 @@ then source "$PATHFILE" elif [ -f "pathfile.txt" ] # Next try in local directory then - echo "Using local pathfile $PATHFILE" - source "$PATHFILE" + echo "Using local pathfile.txt" + source pathfile.txt else # Then try in package directory. echo "Pathfile input not found in local directory ..." DIR=$(conda list | head -n 1 | rev | cut -d' ' -f1 | rev | cut -d: -f1) PATHFILE=$DIR"/pkgs/constax-$VERSION-$BUILD/opt/constax-$VERSION/pathfile.txt" - if [ -f "$PATHFILE" ] - then - sed -i'' -e "s|=.*/opt/constax|=$DIR/pkgs/constax-$VERSION-$BUILD/opt/constax|g" "$PATHFILE" > "$PATHFILE".tmp - source "$PATHFILE".tmp - rm "$PATHFILE".tmp - else - echo "Pathfile input not found at $PATHFILE ..." - fi + if [ -f "$PATHFILE" ]; then source $PATHFILE; echo "Pathfile input found at $PATHFILE ..."; else echo "Pathfile input not found at $PATHFILE ..."; fi + PATHFILE=$DIR"/pkgs/constax-$VERSION-$BUILD_STRING/opt/constax-$VERSION/pathfile.txt" + if [ -f "$PATHFILE" ]; then source $PATHFILE; echo "Pathfile input found at $PATHFILE ..."; else echo "Pathfile input not found at $PATHFILE ..."; fi + PATHFILE=$DIR"/opt/constax-$VERSION/pathfile.txt" + if [ -f "$PATHFILE" ]; then source $PATHFILE; echo "Pathfile input found at $PATHFILE ..."; else echo "Pathfile input not found at $PATHFILE ..."; fi fi # Check for user input paths if [ $(command -v "$SINTAXPATH_USER") ] && [[ "$SINTAXPATH_USER" != false ]] @@ -363,13 +364,13 @@ else echo "RDP: $RDPPATH" if ! [ $(command -v java -jar "$RDPPATH") ] && ! [ $(command -v "$RDPPATH") ] ; then echo "RDP not executable alone or by java -jar" ; fi echo "UTAX: $UTAXPATH" - if ! $BLAST && ! [ $(command -v "$UTAXPATH") ] ; then echo "UTAX not executable" ; fi + if ! $BLAST && ! [ $(command -v "$UTAXPATH") ] ; then echo "UTAX not executable. Did you mean to use -b/--blast flag?" ; fi if $BLAST && ! [ $(command -v blastn) ] ; then echo "BLAST not executable" ; fi echo "CONSTAX: $CONSTAXPATH" if [ -d "$CONSTAXPATH" ] ; then echo "CONSTAX directory not found" ; fi exit 1 fi -if ! $BLAST && [ $(echo "$UTAXPATH" | grep -oP "(?<=usearch).*?(?=\.)") -gt 9 ] +if ! $BLAST && [ $(echo "$UTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/') -gt 9 ] then echo "USEARCH executable must be version 9.X or lower to use UTAX" exit 1 @@ -384,7 +385,11 @@ fi base=$(basename -- ${DB%.fasta}) FORMAT=$(python "$CONSTAXPATH"/detect_format.py -d "$DB" 2>&1) - +if [[ $FORMAT == "INVALID" ]] +then + echo "Database file $DB must be in UNITE or SILVA format, exiting..." + exit 1 +fi echo "Memory size: "$MEM"mb" if [[ "$FORMAT" == "null" ]] @@ -397,7 +402,6 @@ then echo "All checks passed, rerun without --check flag." exit 0 fi - if ! $COMBINE_ONLY then if $TRAIN @@ -406,7 +410,7 @@ then echo "__________________________________________________________________________" echo "Training SINTAX Classifier" - if [ $(echo "$SINTAXPATH" | grep -oP "(?<=usearch).*?(?=\.)") -lt 11 2> /dev/null ] + if [ $(echo "$SINTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/') -lt 11 2> /dev/null ] then "$SINTAXPATH" -makeudb_sintax "${TFILES}/${base}"__UTAX.fasta -output ${TFILES}/sintax.db else @@ -459,15 +463,18 @@ then exit 1 else echo "RDP training error overcome, continuing with classification after SINTAX is retrained" - if [ $(echo "$SINTAXPATH" | grep -oP "(?<=usearch).*?(?=\.)") -lt 11 2> /dev/null ] + if [ $(echo "$SINTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/') -lt 11 2> /dev/null ] then "$SINTAXPATH" -makeudb_sintax "${TFILES}/${base}"__UTAX.fasta -output ${TFILES}/sintax.db else "$SINTAXPATH" -makeudb_usearch "${TFILES}/${base}"__UTAX.fasta -output ${TFILES}/sintax.db fi fi + if [ -f rdp_train.out ] + then + rm rdp_train.out + fi fi - # The rRNAClassifier.properties file should be in one of these two places if [ -f "$CONSTAXPATH"/rRNAClassifier.properties ] then @@ -506,7 +513,6 @@ then then module load BLAST fi - # workaround code for blast getting stuck python "$CONSTAXPATH"/split_inputs.py -i "$FRM_INPUT" echo > "$TAX"/blast.out @@ -550,6 +556,11 @@ then then echo "High Level Taxonomy Assignment" HL_FMT=$(python "$CONSTAXPATH"/detect_format.py -d "$HL_DB" 2>&1) + if [[ $HL_FMT == "INVALID" ]] + then + echo "High-level taxonomy database file $HL_DB must be in UNITE or SILVA format, exiting..." + exit 1 + fi if $MSU_HPCC && ! $BLAST then module load BLAST @@ -559,6 +570,8 @@ then rm "$TAX/"hl_formatted.fasta blastn -query "$FRM_INPUT" -db "$TAX/$(basename -- ${HL_DB%.fasta})"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs 1 -evalue 0.001 > "$TAX"/hl_blast.out rm "$TAX/$(basename -- ${HL_DB%.fasta})"__BLAST.n* + else + echo "" fi rm "$FRM_INPUT" fi diff --git a/constax_no_inputs.sh b/constax_no_inputs.sh index ab831e5..0466a04 100644 --- a/constax_no_inputs.sh +++ b/constax_no_inputs.sh @@ -1,6 +1,6 @@ #!/bin/bash -login -VERSION=2.0.15; BUILD=0; PREFIX=placehold +VERSION=2.0.16; BUILD=0; PREFIX=placehold echo "Welcome to CONSTAX version $VERSION build $BUILD - The CONSensus TAXonomy classifier" echo "This software is distributed under MIT License" @@ -12,7 +12,7 @@ echo "" echo "Please cite us as:" echo "CONSTAX2: Improved taxonomic classification of environmental DNA markers" echo "Julian Aaron Liber, Gregory Bonito, Gian Maria Niccolò Benucci" -echo "bioRxiv 2021.02.15.430803; doi: https://doi.org/10.1101/2021.02.15.430803" +echo "Bioinformatics, Volume 37, Issue 21, 1 November 2021, Pages 3941–3943; doi: https://doi.org/10.1093/bioinformatics/btab347" if $SHOW_VERSION then diff --git a/docs/source/conf.py b/docs/source/conf.py index 7728251..bfb56b1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = 'CONSTAXv2' -copyright = '2020, Julian A. Liber and Gian M. N. Benucci' +copyright = '2021, Julian A. Liber and Gian M. N. Benucci' author = 'Julian A. Liber and Gian M. N. Benucci' # The full version, including alpha/beta/rc tags -release = '2.0.14' +release = '2.0.16' # -- General configuration --------------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 8e1a38c..1484a23 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -12,7 +12,7 @@ taxonomic units at a given taxonomic rank) is realized by generating consensus taxonomy of available classifiers with CONSTAX. -CONSTAX 2.0.14 improves upon 1.0.0 with the following features: +CONSTAX 2.0.16 improves upon 1.0.0 with the following features: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * **Updated software requirements, including Python 3 and Java 8** @@ -56,7 +56,7 @@ CONSTAX 1.0.0 was authored by Reference ^^^^^^^^^ -`Liber JA, Bonito G, Benucci GMN (2021) CONSTAX2: improved taxonomic classification of environmental DNA markers. Bioinformatics doi: 10.1093/bioinformatics/btab347 `_ +`Liber JA, Bonito G, Benucci GMN (2021) CONSTAX2: improved taxonomic classification of environmental DNA markers. Bioinformatics doi: 10.1093/bioinformatics/btab347 `_ `Gdanetz K, Benucci GMN, Vande Pol N, Bonito G (2017) CONSTAX: a tool for improved taxonomic resolution of environmental fungal ITS sequences. BMC Bioinformatics 18:538 doi 10.1186/s12859-017-1952-x `_ diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 93c3e0f..fe98fdc 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -25,15 +25,15 @@ If conda is not installed (you get an error which might include ``command not fo .. code-block:: text - wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.9.2-Linux-x86_64.sh - bash Miniconda3-py39_4.9.2-Linux-x86_64.sh + wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh + bash Miniconda3-py39_4.10.3-Linux-x86_64.sh .. tab:: OSX .. code-block:: text - curl -O https://repo.anaconda.com/miniconda/Miniconda3-py39_4.9.2-MacOSX-x86_64.sh - bash Miniconda3-py39_4.9.2-MacOSX-x86_64.sh + curl -O https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-MacOSX-x86_64.sh + bash Miniconda3-py39_4.10.3-MacOSX-x86_64.sh 2. Follow the prompts. diff --git a/docs/source/referenceDB.rst b/docs/source/referenceDB.rst index bd2c5fd..7c9739d 100644 --- a/docs/source/referenceDB.rst +++ b/docs/source/referenceDB.rst @@ -3,18 +3,19 @@ Suggested Reference Databases ============================= -Dependent on where your sequences originate (e.g. ITS, 16S, LSU), you will need to have an appropriate +Dependent on where your sequences originate (e.g. ITS, 16S, LSU), you will need to have an appropriate database with which to classify them. -For Fungi or all Eukaryotes, the `UNITE `_ database is preferred. The format of the reference database to use with -CONSTAX is one of those under the `General `_ fasta format. +For Fungi or all Eukaryotes, the `UNITE `_ database is preferred. The format of the reference database to use with +CONSTAX is one of those under the `General `_ fasta format. For the latest release (10.05.2021), +training with 32GB of RAM for Fungi only or 40GB for all Eukaryotes should be sufficient. -For Bacteria and Archaea, we recommend the `SILVA `_ reference database. +For Bacteria and Archaea, we recommend the `SILVA `_ reference database. The ``SILVA_XXX_SSURef_tax_silva.fasta.gz`` file can be gunzip-ped and used. .. Note:: - SILVA taxonomy is not assigned by Linnean ranks (*Kingdom*, *Phylum*, etc.), so instead placeholder ranks 1-n are used. - Also, the size of the SILVA database means that a server/cluster is required to train the classifier becasue - 128GB RAM for the RDP training are required. If you have a computer with 32GB of RAM, you may be able to train using + SILVA taxonomy is not assigned by Linnean ranks (*Kingdom*, *Phylum*, etc.), so instead placeholder ranks 1-n are used. + Also, the size of the SILVA database means that a server/cluster is required to train the classifier becasue + 128GB RAM for the RDP training are required. If you have a computer with 32GB of RAM, you may be able to train using the UNITE database. If you cannot train locally for UNITE, the RDP files can be downloaded from `here `_. The ``genus_wordConditionalProbList.txt.gz`` file should be gunzip-ped after downloading. diff --git a/docs/source/tutorial1.rst b/docs/source/tutorial1.rst index 5c6491d..1113df4 100644 --- a/docs/source/tutorial1.rst +++ b/docs/source/tutorial1.rst @@ -60,7 +60,7 @@ where you will add the absolute PATHs for the required software. VSEARCH, BLAST, :align: center .. warning:: - Remember to navigate through your anaconda installation and find the ``constax-2.0.14/`` folder. + Remember to navigate through your anaconda installation and find the ``constax-2.0.16/`` folder. This is the only way to make CONSTAX locate the needed python scripts. Before you can run CONSTAX you need to activate your anaconda environment (alternatively, diff --git a/docs/source/tutorial2.rst b/docs/source/tutorial2.rst index a74ea7d..68cbce7 100644 --- a/docs/source/tutorial2.rst +++ b/docs/source/tutorial2.rst @@ -17,7 +17,7 @@ The code will look like as below #SBATCH --time=10:00:00 #SBATCH --ntasks=1 #SBATCH --cpus-per-task=20 - #SBATCH --mem=128G + #SBATCH --mem=32G #SBATCH --job-name constax_fungi #SBACTH -A shade-cole-bonito diff --git a/docs/source/tutorial5.rst b/docs/source/tutorial5.rst index e76b355..36ba4ad 100644 --- a/docs/source/tutorial5.rst +++ b/docs/source/tutorial5.rst @@ -4,7 +4,11 @@ Downloading the UNITE database This tutorial is about how to obtain a reference database for classification of fungi or eukaryotes in general. These will be downloaded from `UNITE `_. -For classification of fungi, we have had tested with the `RepS 35077 General Release FASTA `_. +For classification of fungi, we have had tested with the `RepS 44343 General Release FASTA `_. + +The eukaryote database with `96423 RepS sequences `_ provides better information about the kingdom +classification of the sequence, but requires slightly more RAM (~40GB). Using the ``--high_level_taxonomy`` option can provide a similar result +but with reduced RAM requirements. .. code-block:: language @@ -14,6 +18,6 @@ For classification of fungi, we have had tested with the `RepS 35077 General Rel Use the FASTA called ``sh_general_release_fungi_35077_RepS_04.02.2020.fasta`` within the expanded directory for your fungal reference database, specified with ``-d`` or ``--db`` in your ``constax`` command. -For the ``--high_level_db`` option, the eukaryotes database found here `https://plutof.ut.ee/#/doi/10.15156/BIO/786370 `_. +For the ``--high_level_db`` option, the eukaryotes database found here `https://plutof.ut.ee/#/doi/10.15156/BIO/1280127 `_. can be used. This will help to remove non-fungal OTUs from your dataset, or can be used as the main database (``-d, --db``) for projects amplifying other eukaryotes. diff --git a/subscript_fasta_addFullLineage.py b/subscript_fasta_addFullLineage.py index 11e16a7..7aa5dd6 100644 --- a/subscript_fasta_addFullLineage.py +++ b/subscript_fasta_addFullLineage.py @@ -1,15 +1,21 @@ #!/usr/bin/python import sys, os -def RDP_head_to_UTAX(lineage): +def RDP_head_to_UTAX(lineage, format): taxa = lineage.split(";")[1:] out = "" for r in range(len(taxa)): - out = F"{out}{'dkpcofgs'[r]}:{taxa[r]}," + if format == "UNITE": + out = F"{out}{'dkpcofg'[r]}:{taxa[r]}," + else: + out = F"{out}{'dkpcofgs'[r]}:{taxa[r]}," + return out[:-1] -def addFullLineage(filebase): +def addFullLineage(filebase, format): print("\n\tAdding Full Lineage\n\n") - f1 = open(filebase+"__RDP_taxonomy_headers.txt", 'r').readlines() + + with open(filebase+"__RDP_taxonomy_headers.txt", 'r') as f: + f1 = f.readlines() hash = {} #lineage map output_RDP = open(filebase+"__RDP_trained.fasta", 'w') @@ -20,7 +26,8 @@ def addFullLineage(filebase): ID = ID.strip(">") hash[ID] = lineage - f2 = open(filebase+"__RDP.fasta", 'r').readlines() + with open(filebase+"__RDP.fasta", 'r') as f: + f2 = f.readlines() for line in f2: if line[0] == '>': ID = line.strip().replace('>', '') @@ -30,7 +37,7 @@ def addFullLineage(filebase): print(ID, 'not in taxonomy file') sys.exit() output_RDP.write(F"{line.strip()}\t{lineage}\n") - output_UTAX.write(F"{line.strip()};tax={RDP_head_to_UTAX(lineage)};\n") + output_UTAX.write(F"{line.strip()};tax={RDP_head_to_UTAX(lineage, format)};\n") else: output_RDP.write(line.strip()+"\n") output_UTAX.write(line.strip()+"\n")