From a90d03cb894bb1c694b73336fe698845182462dc Mon Sep 17 00:00:00 2001
From: Rene
Date: Thu, 2 Feb 2023 10:05:14 -0500
Subject: [PATCH] updated workflow

---
 FCH_workflow.sh             | 753 ++++++++++++++++++++++--------------
 taxatarget_environment.yaml |  46 +++
 2 files changed, 505 insertions(+), 294 deletions(-)
 create mode 100644 taxatarget_environment.yaml

diff --git a/FCH_workflow.sh b/FCH_workflow.sh
index 7133881..2b004eb 100644
--- a/FCH_workflow.sh
+++ b/FCH_workflow.sh
@@ -1,8 +1,12 @@
-#!/bin/basH
+#!/bin/bash
 # Copyright (c) 2022, René KM Xavier
 set -e
 # setting to break on newline only
 IFS=$'\n'
+
+
+set -o errtrace # Enable the ERR trap below so it also fires inside functions and subshells
+trap "echo 'ERROR: there was an error; check the error log'" ERR
 ###################
 #### VARIABLES ####
 ###################
@@ -12,6 +16,10 @@ read -r -p "What is your project name? No spaces or special characters please."
 read -r -p "How much RAM do you have available in gigabytes (Gb)? Use digits only. " RAM || exit 100
 read -r -p "How many cores do you want to use? Use digits only " CPU || exit 100
 read -r -p "Create a samples.txt file with unique sample ids without spaces, saved in the same directory as the raw sequencing reads. Do you have the samples.txt file made? yes or no: " samples || exit 100
+read -r -p "Do you want to count the raw sequencing reads? yes or no: " count || exit 100
+read -r -p "Do you want to preprocess your raw sequencing reads? yes or no: " reads_QC || exit 100
+read -r -p "Do you want to analyze your HQ reads for Eukaryotes? yes or no: " taxatarget || exit 100
+read -r -p "Do you want to analyze your HQ reads for trans-kingdom taxonomy? yes or no: " metaphlan || exit 100
 # Be sure samples.txt is formatted correctly.
 if [[ "$samples" == no ]]
 then
@@ -25,22 +33,21 @@ cd ${project_dir}
 ###################
 ## PREPROCESSING ##
 ###################
 mkdir -p ./00_Raw_Reads/fastqc
-mkdir -p ./01_HQ_Reads/merged_reads/fastqc
-mkdir -p ./01_HQ_Reads/error_corrected/fastqc
+mkdir -p ./01_HQ_Reads/merged_reads
+mkdir -p ./01_HQ_Reads/error_corrected
 mkdir -p ./01_HQ_Reads/fastqc
 mkdir -p ./01_HQ_Reads/logs
-mkdir -p ./01_HQ_Reads/nonpareil
-mkdir -p ./01_HQ_Reads/phyloFlash
 # Copy file of unique sample ids into the project directory
 cp ${reads_dir}/samples.txt .
 # Get a quick look at the size of each sample.
-read -r -p "Do you want to count the raw sequencing reads? yes or no: " count || exit 100
 if [[ "${count}" == yes ]]; then
-    for file in `ls ${reads_dir}/*.f*q.gz`; do
-        echo Counting all the sequences in $file
-        counts=$(gunzip -c $file | grep '^@' | wc -l)
-        echo $file,$counts >> ${reads_dir}/00_Raw_Reads_counts.csv
-    done
+    while read -r sample; do
+        echo Counting all the sequences in $sample
+        reformat.sh in=${reads_dir}/${sample}_R1.fq.gz in2=${reads_dir}/${sample}_R2.fq.gz > ${reads_dir}/${sample}_counts.txt 2>&1
+        counts=$(grep 'Output:' ${reads_dir}/${sample}_counts.txt | cut -f2,3)
+        echo -e "${sample}\t${counts}" >> ${project_dir}/00_Raw_Reads_counts.tsv
+    done < samples.txt
 elif [[ "${count}" == no ]]; then :
 fi
 ############################
@@ -52,108 +59,208 @@ fi
 ###############
 # Install bbmap, fastqc, and multiqc in a conda environment:
 # conda create -y -n pp bbmap fastqc multiqc
-read -r -p "Do you want to preprocess your raw sequencing reads? yes or no: " reads_QC || exit 100
+#read -r -p "Do you want to preprocess your raw sequencing reads? yes or no: " reads_QC || exit 100
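+# Editor's sketch (assumption: raw reads follow the ${sample}_R1.fq.gz / ${sample}_R2.fq.gz
+# naming used by the counting step above): warn early about samples.txt ids with missing files.
+while read -r sample; do
+    [ -f ${reads_dir}/${sample}_R1.fq.gz ] && [ -f ${reads_dir}/${sample}_R2.fq.gz ] \
+        || echo "WARNING: missing raw reads for ${sample}" >&2
+done < samples.txt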
yes or no: " reads_QC || exit 100 if [[ "${reads_QC}" == yes ]] then conda activate pp - while read -r sample; do + if test -f "./01_HQ_Reads/${sample}_HQ.fq"; then : + else + #Input raw reads from 00_Raw_Reads directory + + R1=( ${reads_dir}/${sample}*1.f*q.gz ) + R2=( ${reads_dir}/${sample}*2.f*q.gz ) + + # echo Assessing intial read quality on ${sample} raw reads in ${reads_dir} + # fastqc $R1 $R2 -q -o ./00_Raw_Reads/fastqc + ########################### + #### Format Conversion #### + ########################### + #Create interleaved file and verify paired end reads. + reformat.sh in=$R1 in2=$R2 out=./01_HQ_Reads/iReads.fq.gz verifypaired=t ow >> ./01_HQ_Reads/logs/${sample}.log 2>&1 + + echo Starting to quality filter ${sample} raw reads + + ########################### + #### Adapter Trimming #### + ########################### + bbduk.sh in=./01_HQ_Reads/iReads.fq.gz out=./01_HQ_Reads/atrimmed.fq.gz ref=adapters ktrim=r k=23 mink=11 hdist=1 tpe tbo ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1 + + ########################### + #### Quality Trimming #### + ########################### + #removes Ns from sequences + # 'entropy' means to filter out reads with low complexity + # 'maq' is 'mininum average quality' to filter out overall poor reads + #trimpolygright=0 will remove poly G's from the 3' end that get added erroneously because 2-dye chemistry cannot call 'undefined' bases. Undefined bases get called as "G". + #ftm=5 removes the extra base sometimes added by the sequencer. i.e.: if a read is 151bp it will trim it to 150bp, but leave a 150bp read alone. + bbduk.sh in=./01_HQ_Reads/atrimmed.fq.gz out=./01_HQ_Reads/qtrimmed.fq.gz qtrim=rl trimq=6 ftm=5 trimpolygright=0 minlen=70 ordered maxns=0 maq=8 entropy=.95 ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1 + + ########## NOT RECOMMENDED ############## + #ftl=16 to remove the 16bp of barcode left on the 5' read. 
+
+            ########## NOT RECOMMENDED ##############
+            #ftl=16 to remove the 16bp of barcode left on the 5' read. Removes Nextera 5' noise
+            #bbduk.sh in=./01_HQ_Reads/atrimmed.fq.gz out=./01_HQ_Reads/qtrimmed.fq.gz ftl=16 qtrim=rl trimq=6 ftm=5 trimpolygright=0 minlen=70 ordered maxns=0 maq=8 entropy=.95 ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
+            #########################################
+
+            ###########################
+            ## Contaminant filtering ##
+            ###########################
+            #Remove synthetic artifacts and spike-ins by kmer-matching
+            # 'cardinality' will generate an accurate estimation of the number of unique kmers in the dataset using the LogLog algorithm
+            bbduk.sh in=./01_HQ_Reads/qtrimmed.fq.gz out=./01_HQ_Reads/${sample}_HQ.fq.gz ref=artifacts,phix k=31 hdist=1 stats=./01_HQ_Reads/logs/${sample}_contamination_stats.txt ordered cardinality ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
+
+            echo finished bbduk
+            # remove extra files
+            rm ./01_HQ_Reads/iReads.fq.gz ./01_HQ_Reads/atrimmed.fq.gz ./01_HQ_Reads/qtrimmed.fq.gz >> ./01_HQ_Reads/logs/${sample}.log 2>&1
+            #Merge reads
+            echo merging reads
+            bbmerge-auto.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz out=./01_HQ_Reads/merged_reads/${sample}_HQ_merged.fq.gz outu=./01_HQ_Reads/merged_reads/${sample}_HQ_unmerged.fq.gz rem k=62 extend2=50 ecct vstrict ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
+            echo starting error correction
+            bbcms.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz out=./01_HQ_Reads/error_corrected/${sample}_HQ_ecc.fq.gz outb=./01_HQ_Reads/error_corrected/low_${sample}.fq.gz bits=4 hashes=3 k=31 mincount=2 hcf=0.4 tossjunk=t ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
+
+            #Create R1.fa for nonpareil and phyloflash input
+            reformat.sh in=./01_HQ_Reads/error_corrected/${sample}_HQ_ecc.fq.gz out=./01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R1.fa out2=./01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R2.fa verifypaired ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
+            #Create fastq R1 and R2 input files for TaxaTarget
+            reformat.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz out=./01_HQ_Reads/${sample}_HQ_R1.fastq out2=./01_HQ_Reads/${sample}_HQ_R2.fastq verifypaired ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
+            # Create interleaved fastq file for MetaPhlAn
+            gunzip -k ./01_HQ_Reads/${sample}_HQ.fq.gz
+            echo finished ${sample}
 
-        #Input raw reads from 00_Raw_Reads directory
-
-        R1=( ${reads_dir}/${sample}*1.f*q.gz )
-        R2=( ${reads_dir}/${sample}*2.f*q.gz )
-
-        echo Assessing intial read quality on ${sample} raw reads in ${reads_dir}
-        fastqc $R1 $R2 -q -o ./00_Raw_Reads/fastqc
-        ###########################
-        #### Format Conversion ####
-        ###########################
-        #Create interleaved file and verify paired end reads.
-        reformat.sh in=$R1 in2=$R2 out=./01_HQ_Reads/iReads.fq.gz verifypaired=t ow >> ./01_HQ_Reads/logs/${sample}.log 2>&1
-
-        echo Starting to quality filter ${sample} raw reads
-
-        ###########################
-        #### Adapter Trimming ####
-        ###########################
-        bbduk.sh in=./01_HQ_Reads/iReads.fq.gz out=./01_HQ_Reads/atrimmed.fq.gz ref=adapters ktrim=r k=23 mink=11 hdist=1 tpe tbo ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
-
-        ###########################
-        #### Quality Trimming ####
-        ###########################
-        #removes Ns from sequences
-        # 'entropy' means to filter out reads with low complexity
-        # 'maq' is 'mininum average quality' to filter out overall poor reads
-        #trimpolygright=0 will remove poly G's from the 3' end that get added erroneously because 2-dye chemistry cannot call 'undefined' bases. Undefined bases get called as "G".
-        #ftm=5 removes the extra base sometimes added by the sequencer. i.e.: if a read is 151bp it will trim it to 150bp, but leave a 150bp read alone.
-        #bbduk.sh in=./01_HQ_Reads/atrimmed.fq.gz out=./01_HQ_Reads/qtrimmed.fq.gz qtrim=rl trimq=6 ftm=5 trimpolygright=0 minlen=70 ordered maxns=0 maq=8 entropy=.95 ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
-
-        #ftl=16 to remove the 16bp of barcode left on the 5' read. Removes Nextera 5' noise
-        bbduk.sh in=./01_HQ_Reads/atrimmed.fq.gz out=./01_HQ_Reads/qtrimmed.fq.gz ftl=16 qtrim=rl trimq=6 ftm=5 trimpolygright=0 minlen=70 ordered maxns=0 maq=8 entropy=.95 ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
-
-        ###########################
-        ## Contaminant filtering ##
-        ###########################
-        #Remove synthetic artifacts and spike-ins by kmer-matching
-        # 'cardinality' will generate an accurate estimation of the number of unique kmers in the dataset using the LogLog algorithm
-        bbduk.sh in=./01_HQ_Reads/qtrimmed.fq.gz out=./01_HQ_Reads/${sample}_HQ.fq.gz ref=artifacts,phix k=31 hdist=1 stats=./01_HQ_Reads/logs/${sample}_contamination_stats.txt ordered cardinality ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
-
-        echo finished bbduk
-        # remove extra files
-        rm ./01_HQ_Reads/iReads.fq.gz ./01_HQ_Reads/atrimmed.fq.gz ./01_HQ_Reads/qtrimmed.fq.gz >> ./01_HQ_Reads/logs/${sample}.log 2>&1
-        #Merge reads
-        echo merging reads
-        bbmerge-auto.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz out=./01_HQ_Reads/merged_reads/${sample}_HQ_merged.fq.gz outu=./01_HQ_Reads/merged_reads/${sample}_HQ_unmerged.fq.gz rem k=62 extend2=50 ecct vstrict ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
-        echo starting error correction
-        bbcms.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz out=./01_HQ_Reads/error_corrected/${sample}_HQ_ecc.fq.gz outb=./01_HQ_Reads/error_corrected/low_${sample}.fq.gz bits=4 hashes=3 k=31 mincount=2 hcf=0.4 tossjunk=t ow=t >> ./01_HQ_Reads/logs/${sample}.log 2>&1
-
-        #Create R1.fa for nonpareil and phyloFlash input
-        reformat.sh in=./01_HQ_Reads/error_corrected/${sample}_HQ_ecc.fq.gz out=./01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R1.fa out2=./01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R2.fa verifypaired >> ./01_HQ_Reads/logs/${sample}.log 2>&1
-
-        echo finished ${sample}
-
-
+        fi
     done < samples.txt
-
+    if test -f "./01_HQ_Reads/multiqc_report.html"; then :
+    else
     #Run multiqc on all clean Paired End reads.
-    echo Assessing quality of processed ${project} reads.
-    find ./01_HQ_Reads/ -name "*.fq.gz" -exec fastqc '{}' -q -o ./01_HQ_Reads/fastqc \;
-    multiqc ./01_HQ_Reads/fastqc -o ./01_HQ_Reads/
+        echo Assessing quality of processed ${project} reads.
+        find ./01_HQ_Reads/ -name "*_HQ.fq.gz" -exec fastqc '{}' -t 6 -q -o ./01_HQ_Reads/fastqc \;
+        multiqc ./01_HQ_Reads/fastqc -o ./01_HQ_Reads/
+    fi
    conda deactivate
-    echo Finished read quality control. Starting analysis of sequencing effort.
-    echo
+    #########################################################################################################################################
+    ##############################                            READ-BASED ANALYSIS                             ##############################
+    #########################################################################################################################################
 
-    conda activate nonpareil
+    echo Finished read quality control. Starting analysis of sequencing effort.
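+    # Editor's note (sketch, untested; assumes the companion Nonpareil R package is installed):
+    # the .npo files written by the loop below can be plotted afterwards with, e.g.:
+    # Rscript -e 'library(Nonpareil); Nonpareil.set(list.files("01_HQ_Reads/nonpareil", ".npo$", full.names=TRUE))'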
+    conda activate nonpareil
+    mkdir -p ./01_HQ_Reads/nonpareil
     while read -r sample; do
-        echo Estimating sequencing effort for ${sample}
-        nonpareil -s ./01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R1.fa -T alignment -L 35 -t ${CPU} -R ${RAM}000 -b ./01_HQ_Reads/nonpareil/${sample}
+        if test -f "./01_HQ_Reads/nonpareil/${sample}.log"; then :
+        else
+            echo Estimating sequencing effort for ${sample}
+            nonpareil -s ./01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R1.fa -T alignment -f fasta -t 12 -b ./01_HQ_Reads/nonpareil/${sample} >> ./01_HQ_Reads/nonpareil/${sample}.log 2>&1
+            #nonpareil -s ./01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R1.fa -T alignment -L 35 -t ${CPU} -R ${RAM}000 -b ./01_HQ_Reads/nonpareil/${sample} >> ./01_HQ_Reads/nonpareil/${sample}.log 2>&1
+        fi
     done < samples.txt
     conda deactivate
-
+    ###########################
+    ##  READ-BASED TAXONOMY  ##
+    ###########################
     echo Starting assembly of rRNA SSU and classification
     conda activate phyloflash
-
+    mkdir -p ./02_Taxonomy/phyloflash
     while read -r sample; do
-        cd ${project_dir}/01_HQ_Reads/phyloFlash
-        echo running phyloFlash on ${sample}
-        phyloFlash.pl -lib ${sample} -read1 ${project_dir}/01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R1.fa -read2 ${project_dir}/01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R2.fa -CPU ${CPU} -almosteverything -taxlevel 20
+        if test -f "${project_dir}/02_Taxonomy/phyloflash/${sample}.phyloFlash.log"; then :
+        else
+            cd ${project_dir}/02_Taxonomy/phyloflash
+            echo Mining ${sample} metagenome for rRNA genes
+            phyloFlash.pl -lib ${sample} -read1 ${project_dir}/01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R1.fa -read2 ${project_dir}/01_HQ_Reads/error_corrected/${sample}_HQ_ecc_R2.fa -CPU ${CPU} -almosteverything -taxlevel 20 -log
+        fi
     done < samples.txt
     conda deactivate
+elif [[ "${reads_QC}" == no ]]; then :
+fi
+
+# Install taxaTarget and dependencies following instructions at: https://github.com/SethCommichaux/taxaTarget
+#read -r -p "Do you want to analyze your HQ reads for Eukaryotes? yes or no: " taxatarget || exit 100
+if [[ "${taxatarget}" == no ]]; then :
+elif [[ "${taxatarget}" == yes ]]; then
+    mkdir -p ./02_Taxonomy/taxatarget
+
+    source activate taxatarget
+
+    # Run taxaTarget on sample reads
+    for sample in `cat samples.txt`; do
+        if test -f "./02_Taxonomy/taxatarget/${sample}_results/Taxonomic_report.txt"; then :
+        else
+
+            echo Analyzing ${sample} sample for Eukaryotes
+            HQ1=./01_HQ_Reads/${sample}_HQ_R1.fastq
+            HQ2=./01_HQ_Reads/${sample}_HQ_R2.fastq
+
+            python /home/fch/GitHub/taxaTarget/run_pipeline_scripts/run_protist_pipeline_fda.py -r ${HQ1} -r2 ${HQ2} -e /home/fch/GitHub/taxaTarget/run_pipeline_scripts/environment.txt -o ./02_Taxonomy/taxatarget/${sample}_results -t ${CPU} >> ./02_Taxonomy/taxatarget/${sample}.log 2>&1
+            python /home/fch/bin/modify_taxonomic_report.py ./02_Taxonomy/taxatarget/${sample}_results/Taxonomic_report.txt
+        fi
+    done
+    if test -f "${project_dir}/02_Taxonomy/taxatarget/files.txt"; then :
+    else
+        # Data Wrangling step. Create input file for aggregate_results.py
+        ls ${project_dir}/02_Taxonomy/taxatarget/*_results/Taxonomic_report.txt.modified > tmp.txt
+        for line in `cat tmp.txt`; do
+            sample=$(echo ${line} | rev | cut -f2 -d '/' | rev | cut -f1 -d '_')
+            echo -e "${sample}\t${line}" >> ${project_dir}/02_Taxonomy/taxatarget/files.txt
+        done
+        rm tmp.txt
+        # Aggregate TaxaTarget results into one matrix for statistical analysis
+        cd ${project_dir}/02_Taxonomy/taxatarget
+        python /home/fch/bin/aggregate_results.py files.txt
+        cd ${project_dir}
+    fi
+    conda deactivate
+fi
+cd ${project_dir}
+# Install metaphlan into conda environment
+# conda create -n metaphlan -y -c bioconda python=3.7 metaphlan
+#activate metaphlan conda environment
+# conda activate metaphlan
+# Install metaphlan databases
+# metaphlan --install --bowtie2db
+#read -r -p "Do you want to analyze your HQ reads for trans-kingdom taxonomy? yes or no: " metaphlan || exit 100
+# MetaPhlAn introduces the capability of characterizing organisms at the strain level using
+# non-aggregated marker information. Such capability comes in several slightly different flavours
+# that provide a way to perform strain tracking and comparison across multiple samples.
+# Usually, MetaPhlAn is first run with the default -t to profile the species present in
+# the community, and then a strain-level profiling can be performed to zoom in on specific species
+# of interest. This operation can be performed quickly as it exploits the --bowtie2out intermediate
+# file saved during the execution of the default analysis type.
+if [[ "${metaphlan}" == no ]]; then :
+elif [[ "${metaphlan}" == yes ]]; then
+    conda activate metaphlan
+    while read -r sample; do
+        if test -f "./02_Taxonomy/metaphlan/${sample}/${sample}_viral_ukn_profile.txt"; then :
+        else
+            mkdir -p ./02_Taxonomy/metaphlan/${sample}
+            echo Creating trans-kingdom taxonomic profile for ${sample}
+            #metaphlan ./01_HQ_Reads/${sample}_HQ.fq --input_type fastq --add_viruses --bowtie2out ./02_Taxonomy/metaphlan/${sample}/${sample}_viral_bowtie2.bz2 --nproc ${CPU} -o ./02_Taxonomy/metaphlan/${sample}/${sample}_viral_profile.txt
+            metaphlan ./01_HQ_Reads/${sample}_HQ.fq --input_type fastq --unknown_estimation --add_viruses --bowtie2out ./02_Taxonomy/metaphlan/${sample}/${sample}_viral_ukn_bowtie2.bz2 --nproc ${CPU} -o ./02_Taxonomy/metaphlan/${sample}/${sample}_viral_ukn_profile.txt
+            #metaphlan ./01_HQ_Reads/${sample}_HQ.fq --input_type fastq --bowtie2out ./02_Taxonomy/metaphlan/${sample}/${sample}_bowtie2.bz2 --nproc ${CPU} -o ./02_Taxonomy/metaphlan/${sample}/${sample}_profile.txt
+            #metaphlan ./01_HQ_Reads/${sample}_HQ.fq --input_type fastq --unknown_estimation --bowtie2out ./02_Taxonomy/metaphlan/${sample}/${sample}_ukn_bowtie2.bz2 --nproc ${CPU} -o ./02_Taxonomy/metaphlan/${sample}/${sample}_ukn_profile.txt
+        fi
+    done < samples.txt
+    if test -f "./02_Taxonomy/metaphlan/profiles_merged/${project}_viral_ukn_profile.txt"; then :
+    else
+        #Make a matrix of taxonomy data
+        mkdir -p ./02_Taxonomy/metaphlan/profiles_merged
+        #merge_metaphlan_tables.py ./02_Taxonomy/metaphlan/*/*_viral_profile.txt > ./02_Taxonomy/metaphlan/profiles_merged/${project}_viral_profile.txt
+        merge_metaphlan_tables.py ./02_Taxonomy/metaphlan/*/*_viral_ukn_profile.txt > ./02_Taxonomy/metaphlan/profiles_merged/${project}_viral_ukn_profile.txt
+        #merge_metaphlan_tables.py ./02_Taxonomy/metaphlan/*/*_profile.txt > ./02_Taxonomy/metaphlan/profiles_merged/${project}_profile.txt
+        #merge_metaphlan_tables.py ./02_Taxonomy/metaphlan/*/*_ukn_profile.txt > ./02_Taxonomy/metaphlan/profiles_merged/${project}_ukn_profile.txt
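+        # Editor's sketch (untested; adapted from the recipe in the MetaPhlAn docs): reduce
+        # the merged table to species-level rows for downstream statistics, e.g.:
+        # grep -E "s__|clade_name" ./02_Taxonomy/metaphlan/profiles_merged/${project}_viral_ukn_profile.txt | grep -v "t__" | sed "s/^.*|//g" > ./02_Taxonomy/metaphlan/profiles_merged/${project}_species.tsv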
+    fi
     conda deactivate
-
-elif [[ "${reads_QC}" == no ]]; then :
 fi
+#########################################################################################################################################
+##############################                          ASSEMBLY-BASED ANALYSIS                          #########
+#########################################################################################################################################
+
 # The following assembly workflow creates assemblies for three debruijn graph assemblers and compares them for each sample.
 ###############
 ###  CONDA  ###
 ###############
@@ -173,10 +280,13 @@ if [[ "${assemble}" == yes ]]; then
     ################
     ### Assembly ###
     ################
+    read -r -p "Which HQ reads dataset do you want to use to assemble? HQ HQ_ecc HQ_merged Skip: " HQ_Reads || exit 100
+    read -r -p "Would you like to evaluate the assemblies? yes or no: " quast || exit 100
+
     for sample in `cat samples.txt`; do
+    mkdir -p ./02_Assembly/quast/${sample}
     conda activate assembly
-    read -r -p "Which HQ reads dataset do you want to use to assemble $sample? HQ HQ_ecc HQ_merged Skip: " HQ_Reads || exit 100
 
     if [[ "${HQ_Reads}" == HQ ]]
     then
@@ -193,12 +303,12 @@ if [[ "${assemble}" == yes ]]; then
         else
             #Assemble reads per site using megahit
             echo Using megahit to assemble ${HQ_Reads} ${sample} reads.
-            # megahit --12 ./01_HQ_Reads/${sample}_HQ.fq.gz --presets meta-large -m ${RAM} -t ${CPU} -o ~/${sample} >> ./02_Assembly/megahit-large/logs/${sample}.log 2>&1
-            megahit --12 ./01_HQ_Reads/${sample}_HQ.fq.gz --k-min 21 --k-max 91 --k-step 10 -m ${RAM} -t ${CPU} -o ~/${sample} >> ./02_Assembly/megahit-large/logs/${sample}.log 2>&1
+            megahit --12 ./01_HQ_Reads/${sample}_HQ.fq.gz --presets meta-large -m 0.5 -t ${CPU} -o ~/${sample} >> ./02_Assembly/megahit-large/logs/${sample}.log 2>&1
             cp ~/${sample}/final.contigs.fa ./02_Assembly/quast/${sample}/megahit.fasta
             mkdir -p ./02_Assembly/megahit-large/${sample}
             mv ~/${sample} ./02_Assembly/megahit-large
         fi
+
         if test -f "./01_HQ_Reads/${sample}_HQ.fa"; then :
         else
             #IDBA can only take fasta files as input; therefore, fastq.gz files must be reformated to fasta files
@@ -215,18 +325,23 @@ if [[ "${assemble}" == yes ]]; then
             cp ./02_Assembly/idba_ud/${sample}/scaffold.fa ./02_Assembly/quast/${sample}/idba.fasta
         fi
 
-        if test -f "./02_Assembly/stats/${sample}_assembly_stats.csv"; then :
+        if grep -m 1 -q "${HQ_Reads}" ./02_Assembly/stats/${sample}_assembly_summary.tsv; then :
         else
             # --- Evaluation ---
-            echo metagenome_id,reads,assembler,n_contigs,contig_bp,gap_pct,ctg_L50,ctg_max,gene_counts,mapped_pct,avg_cov,assembly_performance >> ./02_Assembly/stats/${sample}_assembly_stats.csv
+            echo -e "metagenome_id\treads\tassembler\tn_contigs\tcontig_bp\tgap_pct\tctg_L50\tctg_max\tgene_counts\tmapped_pct\tavg_cov\tassembly_performance" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv
             echo Evaluating ${sample} assemblies with AssemblyStats
             ##statswrapper.sh ./02_Assembly/quast/${sample}/*.fasta minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/stats/${sample}.txt
             stats.sh ./02_Assembly/metaspades/${sample}/scaffolds.fasta minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/metaspades/${sample}/stats.txt
-            spades_stats=$(cat ./02_Assembly/metaspades/${sample}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+            spades_stats=$(cat ./02_Assembly/metaspades/${sample}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
             stats.sh ./02_Assembly/megahit-large/${sample}/final.contigs.fa minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/megahit-large/${sample}/stats.txt
-            megahit_stats=$(cat ./02_Assembly/megahit-large/${sample}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+            megahit_stats=$(cat ./02_Assembly/megahit-large/${sample}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
             stats.sh ./02_Assembly/idba_ud/${sample}/scaffold.fa minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/idba_ud/${sample}/stats.txt
-            idba_stats=$(cat ./02_Assembly/idba_ud/${sample}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+            idba_stats=$(cat ./02_Assembly/idba_ud/${sample}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
+
+            spades_L50=$(cat ./02_Assembly/metaspades/${sample}/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+            megahit_L50=$(cat ./02_Assembly/megahit-large/${sample}/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+            idba_L50=$(cat ./02_Assembly/idba_ud/${sample}/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+
             #Evaluate assemblies with Quast (leave out "-R ref.fa if you don't have a reference)
 
             echo Predicting prokaryotic genes in each ${sample} assembly
@@ -247,25 +362,25 @@ if [[ "${assemble}" == yes ]]; then
             # Note that these are reads mapped to contigs > 2,000 bp.
             #(i.e. % mapped) by mapping reads by to assembly
             bbmap.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz ref=./02_Assembly/metaspades/${sample}/${sample}_c1k.fa nodisk covhist=./02_Assembly/metaspades/${sample}/covhist.txt covstats=./02_Assembly/metaspades/${sample}/covstats.txt out=./02_Assembly/metaspades/${sample}/${sample}_reads_assembled.fq.gz outu=./02_Assembly/metaspades/${sample}/${sample}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/metaspades/${sample}/map.log 2>&1
-            spades_match=$(grep Percent\ mapped: ./02_Assembly/metaspades/${sample}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            spades_cov=$(grep Average\ coverage: ./02_Assembly/metaspades/${sample}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            spades_Pa=$(echo "$spades_match*$spades_cov" | bc)
+            spades_match=$(grep Percent\ mapped: ./02_Assembly/metaspades/${sample}/map.log | cut -f2 -d$'\t' | tail -1)
+            spades_cov=$(grep Average\ coverage: ./02_Assembly/metaspades/${sample}/map.log | cut -f2 -d$'\t' | tail -1)
+            spades_Pa=$(echo "$spades_match*$spades_L50" | bc)
 
             bbmap.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz ref=./02_Assembly/megahit-large/${sample}/${sample}_c1k.fa nodisk covhist=./02_Assembly/megahit-large/${sample}/covhist.txt covstats=./02_Assembly/megahit-large/${sample}/covstats.txt outm=./02_Assembly/megahit-large/${sample}/${sample}_reads_assembled.fq.gz outu=./02_Assembly/megahit-large/${sample}/${sample}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/megahit-large/${sample}/map.log 2>&1
-            megahit_match=$(grep Percent\ mapped: ./02_Assembly/megahit-large/${sample}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            megahit_cov=$(grep Average\ coverage: ./02_Assembly/megahit-large/${sample}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            megahit_Pa=$(echo "$megahit_match*$megahit_cov" | bc)
+            megahit_match=$(grep Percent\ mapped: ./02_Assembly/megahit-large/${sample}/map.log | cut -f2 -d$'\t' | tail -1)
+            megahit_cov=$(grep Average\ coverage: ./02_Assembly/megahit-large/${sample}/map.log | cut -f2 -d$'\t' | tail -1)
+            megahit_Pa=$(echo "$megahit_match*$megahit_L50" | bc)
 
             bbmap.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz ref=./02_Assembly/idba_ud/${sample}/${sample}_c1k.fa nodisk covhist=./02_Assembly/idba_ud/${sample}/covhist.txt covstats=./02_Assembly/idba_ud/${sample}/covstats.txt outm=./02_Assembly/idba_ud/${sample}/${sample}_reads_assembled.fq.gz outu=./02_Assembly/idba_ud/${sample}/${sample}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/idba_ud/${sample}/map.log 2>&1
-            idba_match=$(grep Percent\ mapped: ./02_Assembly/idba_ud/${sample}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            idba_cov=$(grep Average\ coverage: ./02_Assembly/idba_ud/${sample}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            idba_Pa=$(echo "$idba_match*$idba_cov" | bc)
+            idba_match=$(grep Percent\ mapped: ./02_Assembly/idba_ud/${sample}/map.log | cut -f2 -d$'\t' | tail -1)
+            idba_cov=$(grep Average\ coverage: ./02_Assembly/idba_ud/${sample}/map.log | cut -f2 -d$'\t' | tail -1)
+            idba_Pa=$(echo "$idba_match*$idba_L50" | bc)
 
             echo Creating assembly comparision for ${sample}
-            echo ${sample},${HQ_Reads},metaspades,${spades_stats},${spades_counts},${spades_match},${spades_cov},${spades_Pa} >> ./02_Assembly/stats/${sample}_assembly_stats.csv
-            echo ${sample},${HQ_Reads},megahit,${megahit_stats},${megahit_counts},${megahit_match},${megahit_cov},${megahit_Pa} >> ./02_Assembly/stats/${sample}_assembly_stats.csv
-            echo ${sample},${HQ_Reads},idba_ud,${idba_stats},${idba_counts},${idba_match},${idba_cov},${idba_Pa} >> ./02_Assembly/stats/${sample}_assembly_stats.csv
+            echo -e "${sample}\t${HQ_Reads}\tmetaspades\t${spades_stats}\t${spades_counts}\t${spades_match}\t${spades_cov}\t${spades_Pa}" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv
+            echo -e "${sample}\t${HQ_Reads}\tmegahit\t${megahit_stats}\t${megahit_counts}\t${megahit_match}\t${megahit_cov}\t${megahit_Pa}" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv
+            echo -e "${sample}\t${HQ_Reads}\tidba_ud\t${idba_stats}\t${idba_counts}\t${idba_match}\t${idba_cov}\t${idba_Pa}" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv
         fi
 
@@ -276,7 +391,7 @@ if [[ "${assemble}" == yes ]]; then
         else
             echo Using metaspades to assemble ${HQ_Reads} ${sample} reads.
             mkdir -p ./02_Assembly/metaspades/${sample}_ecc
-            metaspades.py -k 21,33,55,77,99,127 --12 ./01_HQ_Reads/error_corrected/${sample}_HQ_ecc.fq.gz -t ${CPU} -m ${RAM} -o ./02_Assembly/metaspades/${sample}_ecc >> ./02_Assembly/metaspades/logs/${sample}.log 2>&1
+            metaspades.py -k 21,33,55,77,99,127 --only-assembler --12 ./01_HQ_Reads/error_corrected/${sample}_HQ_ecc.fq.gz -t ${CPU} -m ${RAM} -o ./02_Assembly/metaspades/${sample}_ecc >> ./02_Assembly/metaspades/logs/${sample}.log 2>&1
             cp ./02_Assembly/metaspades/${sample}_ecc/scaffolds.fasta ./02_Assembly/quast/${sample}/metaspades_ecc.fasta
         fi
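+        # Editor's note (assumption): --only-assembler is reasonable above because the _ecc
+        # reads were already error-corrected with bbcms; sanity-check that scaffolds exist:
+        [ -s ./02_Assembly/metaspades/${sample}_ecc/scaffolds.fasta ] || echo "WARNING: no metaspades scaffolds for ${sample}_ecc" >&2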
@@ -286,8 +401,7 @@ if [[ "${assemble}" == yes ]]; then
         else
             #Assemble reads per site using megahit
             echo Using megahit to assemble ${HQ_Reads} ${sample} reads.
-            # megahit --12 ./01_HQ_Reads/error_corrected/${sample}_HQ_ecc.fq.gz --presets meta-large -m ${RAM} -t ${CPU} -o ./02_Assembly/megahit-large/${sample} >> ./02_Assembly/megahit-large/logs/${sample}.log 2>&1
-            megahit --12 ./01_HQ_Reads/error_corrected/${sample}_HQ_ecc.fq.gz --k-min 21 --k-max 91 --k-step 10 -m ${RAM} -t ${CPU} -o ~/${sample}_ecc >> ./02_Assembly/megahit-large/logs/${sample}.log 2>&1
+            megahit --12 ./01_HQ_Reads/error_corrected/${sample}_HQ_ecc.fq.gz --presets meta-large -m 0.5 -t ${CPU} -o ~/${sample}_ecc >> ./02_Assembly/megahit-large/logs/${sample}.log 2>&1
             cp ~/${sample}_ecc/final.contigs.fa ./02_Assembly/quast/${sample}/megahit_ecc.fasta
             mkdir -p ./02_Assembly/megahit-large/${sample}_ecc
             mv ~/${sample}_ecc/ ./02_Assembly/megahit-large
@@ -312,18 +426,23 @@ if [[ "${assemble}" == yes ]]; then
             cp ./02_Assembly/idba_ud/${sample}_ecc/scaffold.fa ./02_Assembly/quast/${sample}/idba_ecc.fasta
         fi
 
-        if test -f "./02_Assembly/stats/${sample}_assembly_stats.csv"; then :
+        if grep -m 1 -q "${HQ_Reads}" ./02_Assembly/stats/${sample}_assembly_summary.tsv; then :
         else
             # --- Evaluation ---
-            echo metagenome_id,reads,assembler,n_contigs,contig_bp,gap_pct,ctg_L50,ctg_max,gene_counts,mapped_pct,avg_cov,assembly_performance >> ./02_Assembly/stats/${sample}_assembly_stats.csv
+            echo -e "metagenome_id\treads\tassembler\tn_contigs\tcontig_bp\tgap_pct\tctg_L50\tctg_max\tgene_counts\tmapped_pct\tavg_cov\tassembly_performance" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv
             echo Evaluating ${sample} assemblies with AssemblyStats
             stats.sh ./02_Assembly/metaspades/${sample}_ecc/scaffolds.fasta minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/metaspades/${sample}_ecc/stats.txt
-            spades_stats=$(cat ./02_Assembly/metaspades/${sample}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+            spades_stats=$(cat ./02_Assembly/metaspades/${sample}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
             stats.sh ./02_Assembly/megahit-large/${sample}_ecc/final.contigs.fa minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/megahit-large/${sample}_ecc/stats.txt
-            megahit_stats=$(cat ./02_Assembly/megahit-large/${sample}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+            megahit_stats=$(cat ./02_Assembly/megahit-large/${sample}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
             stats.sh ./02_Assembly/idba_ud/${sample}_ecc/scaffold.fa minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/idba_ud/${sample}_ecc/stats.txt
-            idba_stats=$(cat ./02_Assembly/idba_ud/${sample}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+            idba_stats=$(cat ./02_Assembly/idba_ud/${sample}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
+
+            spades_L50=$(cat ./02_Assembly/metaspades/${sample}_ecc/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+            megahit_L50=$(cat ./02_Assembly/megahit-large/${sample}_ecc/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+            idba_L50=$(cat ./02_Assembly/idba_ud/${sample}_ecc/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+
             #Evaluate assemblies with Quast (leave out "-R ref.fa if you don't have a reference)
 
             echo Predicting prokaryotic genes in each ${sample} assembly
@@ -341,27 +460,27 @@ if [[ "${assemble}" == yes ]]; then
             idba_counts=$(grep '^>' ./02_Assembly/idba_ud/${sample}_ecc/${sample}_c1k_proteins.faa | wc -l )
 
             echo Determining metagenome content within ${sample} assemblies
-            # Note that these are reads mapped to contigs > 2,000 bp.
+            # Note that these are reads mapped to contigs > 1,000 bp.
             #(i.e. % mapped) by mapping reads by to assembly
             bbmap.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz ref=./02_Assembly/metaspades/${sample}_ecc/${sample}_c1k.fa nodisk covhist=./02_Assembly/metaspades/${sample}_ecc/covhist.txt covstats=./02_Assembly/metaspades/${sample}_ecc/covstats.txt out=./02_Assembly/metaspades/${sample}_ecc/${sample}_reads_assembled.fq.gz outu=./02_Assembly/metaspades/${sample}_ecc/${sample}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/metaspades/${sample}_ecc/map.log 2>&1
-            spades_match=$(grep Percent\ mapped: ./02_Assembly/metaspades/${sample}_ecc/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            spades_cov=$(grep Average\ coverage: ./02_Assembly/metaspades/${sample}_ecc/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            spades_Pa=$(echo "$spades_match*$spades_cov" | bc)
+            spades_match=$(grep Percent\ mapped: ./02_Assembly/metaspades/${sample}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
+            spades_cov=$(grep Average\ coverage: ./02_Assembly/metaspades/${sample}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
+            spades_Pa=$(echo "$spades_match*$spades_L50" | bc)
 
             bbmap.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz ref=./02_Assembly/megahit-large/${sample}_ecc/${sample}_c1k.fa nodisk covhist=./02_Assembly/megahit-large/${sample}_ecc/covhist.txt covstats=./02_Assembly/megahit-large/${sample}_ecc/covstats.txt outm=./02_Assembly/megahit-large/${sample}_ecc/${sample}_reads_assembled.fq.gz outu=./02_Assembly/megahit-large/${sample}_ecc/${sample}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/megahit-large/${sample}_ecc/map.log 2>&1
             megahit_match=$(grep Percent\ mapped: ./02_Assembly/megahit-large/${sample}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
             megahit_cov=$(grep Average\ coverage: ./02_Assembly/megahit-large/${sample}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
-            megahit_Pa=$(echo "$megahit_match*$megahit_cov" | bc)
+            megahit_Pa=$(echo "$megahit_match*$megahit_L50" | bc)
 
             bbmap.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz ref=./02_Assembly/idba_ud/${sample}_ecc/${sample}_c1k.fa nodisk covhist=./02_Assembly/idba_ud/${sample}_ecc/covhist.txt covstats=./02_Assembly/idba_ud/${sample}_ecc/covstats.txt outm=./02_Assembly/idba_ud/${sample}_ecc/${sample}_reads_assembled.fq.gz outu=./02_Assembly/idba_ud/${sample}_ecc/${sample}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/idba_ud/${sample}_ecc/map.log 2>&1
             idba_match=$(grep Percent\ mapped: ./02_Assembly/idba_ud/${sample}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
             idba_cov=$(grep Average\ coverage: ./02_Assembly/idba_ud/${sample}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
-            idba_Pa=$(echo "$idba_match*$idba_cov" | bc)
+            idba_Pa=$(echo "$idba_match*$idba_L50" | bc)
 
-            echo Creating assembly comparision for ${sample}
-            echo ${sample},${HQ_Reads},metaspades,${spades_stats},${spades_counts},${spades_match},${spades_cov},${spades_Pa} >> ./02_Assembly/stats/${sample}_assembly_stats.csv
-            echo ${sample},${HQ_Reads},megahit,${megahit_stats},${megahit_counts},${megahit_match},${megahit_cov},${megahit_Pa} >> ./02_Assembly/stats/${sample}_assembly_stats.csv
-            echo ${sample},${HQ_Reads},idba_ud,${idba_stats},${idba_counts},${idba_match},${idba_cov},${idba_Pa} >> ./02_Assembly/stats/${sample}_assembly_stats.csv
+            echo Creating assembly summary for ${sample}
+            echo -e "${sample}\t${HQ_Reads}\tmetaspades\t${spades_stats}\t${spades_counts}\t${spades_match}\t${spades_cov}\t${spades_Pa}" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv
"${sample}\t${HQ_Reads}\tmegahit\t${megahit_stats}\t${megahit_counts}\t${megahit_match}\t${megahit_cov}\t${megahit_Pa}" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv + echo -e "${sample}\t${HQ_Reads}\tidba_ud\t${idba_stats}\t${idba_counts}\t${idba_match}\t${idba_cov}\t${idba_Pa}" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv fi elif [[ "${HQ_Reads}" == HQ_merged ]] @@ -381,24 +500,29 @@ if [[ "${assemble}" == yes ]]; then else echo Using megahit to assemble ${HQ_Reads} ${sample} reads. - # megahit -r ./01_HQ_Reads/merged_reads/${sample}_HQ_merged.fq.gz --12 ./01_HQ_Reads/merged_reads/${sample}_HQ_unmerged.fq.gz --presets meta-large -m ${RAM} -t ${CPU} -o ./02_Assembly/megahit-large/${sample} >> ./02_Assembly/megahit-large/logs/${sample}.log 2>&1 - megahit -r ./01_HQ_Reads/merged_reads/${sample}_HQ_merged.fq.gz --12 ./01_HQ_Reads/merged_reads/${sample}_HQ_unmerged.fq.gz --k-min 21 --k-max 91 --k-step 10 -m ${RAM} -t ${CPU} -o ~/${sample}_merged >> ./02_Assembly/megahit-large/logs/${sample}.log 2>&1 + megahit -r ./01_HQ_Reads/merged_reads/${sample}_HQ_merged.fq.gz --12 ./01_HQ_Reads/merged_reads/${sample}_HQ_unmerged.fq.gz --presets meta-large -m 0.5 -t ${CPU} -o ./02_Assembly/megahit-large/${sample} >> ./02_Assembly/megahit-large/logs/${sample}.log 2>&1 cp ~/${sample}_merged/final.contigs.fa ./02_Assembly/quast/${sample}/megahit_merged.fasta mkdir -p ./02_Assembly/megahit-large/${sample}_merged/ mv ~/${sample}_merged/ ./02_Assembly/megahit-large/ echo idba_ud does not assemble merged reads and will not be used for comparison. fi - - - if test -f "./02_Assembly/stats/${sample}_assembly_stats.csv"; then : + ######### + ## FIX ## + ######### + ## Fix code so 'No such file or directory' Error does not show + if grep -m 1 -q "${HQ_Reads}" ./02_Assembly/stats/${sample}_assembly_summary.tsv; then : else - echo metagenome_id,reads,assembler,n_contigs,contig_bp,gap_pct,ctg_L50,ctg_max,gene_counts,mapped_pct,avg_cov,assembly_performance >> ./02_Assembly/stats/${sample}_assembly_stats.csv + echo -e "metagenome_id\treads\tassembler\tn_contigs\tcontig_bp\tgap_pct\tctg_L50\tctg_max\tgene_counts\tmapped_pct\tavg_cov\tassembly_performance" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv echo Evaluating ${sample} assemblies with AssemblyStats stats.sh ./02_Assembly/metaspades/${sample}_merged/scaffolds.fasta minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/metaspades/${sample}_merged/stats.txt - spades_stats=$(cat ./02_Assembly/metaspades/${sample}_merged/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g') + spades_stats=$(cat ./02_Assembly/metaspades/${sample}_merged/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d') stats.sh ./02_Assembly/megahit-large/${sample}_merged/final.contigs.fa minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/megahit-large/${sample}_merged/stats.txt - megahit_stats=$(cat ./02_Assembly/megahit-large/${sample}_merged/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g') + megahit_stats=$(cat ./02_Assembly/megahit-large/${sample}_merged/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d') + + spades_L50=$(cat ./02_Assembly/metaspades/${sample}_merged/stats.txt | cut -f9 -d$'\t' | sed '2q;d') + megahit_L50=$(cat ./02_Assembly/megahit-large/${sample}_meged/stats.txt | cut -f9 -d$'\t' | sed '2q;d') + #Evaluate assemblies with Quast (leave out "-R ref.fa if you don't have a reference) echo Predicting prokaryotic genes in each ${sample} assembly @@ -417,17 +541,17 @@ if [[ "${assemble}" == yes ]]; then bbmap.sh 
             bbmap.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz ref=./02_Assembly/metaspades/${sample}_merged/${sample}_c1k.fa nodisk covhist=./02_Assembly/metaspades/${sample}_merged/covhist.txt covstats=./02_Assembly/metaspades/${sample}_merged/covstats.txt out=./02_Assembly/metaspades/${sample}_merged/${sample}_reads_assembled.fq.gz outu=./02_Assembly/metaspades/${sample}_merged/${sample}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/metaspades/${sample}_merged/map.log 2>&1
             spades_match=$(grep Percent\ mapped: ./02_Assembly/metaspades/${sample}_merged/map.log | cut -f2 -d$'\t' | tail -1)
             spades_cov=$(grep Average\ coverage: ./02_Assembly/metaspades/${sample}_merged/map.log | cut -f2 -d$'\t' | tail -1)
-            spades_Pa=$(echo "$spades_match*$spades_cov" | bc)
+
+            spades_Pa=$(echo "$spades_match*$spades_L50" | bc)
 
             bbmap.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz ref=./02_Assembly/megahit-large/${sample}_merged/${sample}_c1k.fa nodisk covhist=./02_Assembly/megahit-large/${sample}_merged/covhist.txt covstats=./02_Assembly/megahit-large/${sample}_merged/covstats.txt outm=./02_Assembly/megahit-large/${sample}_merged/${sample}_reads_assembled.fq.gz outu=./02_Assembly/megahit-large/${sample}_merged/${sample}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/megahit-large/${sample}_merged/map.log 2>&1
             megahit_match=$(grep Percent\ mapped: ./02_Assembly/megahit-large/${sample}_merged/map.log | cut -f2 -d$'\t' | tail -1)
             megahit_cov=$(grep Average\ coverage: ./02_Assembly/megahit-large/${sample}_merged/map.log | cut -f2 -d$'\t' | tail -1)
-            metahit_Pa=$(echo "$megahit_match*$megahit_cov" | bc)
+            megahit_Pa=$(echo "$megahit_match*$megahit_L50" | bc)
 
             echo Creating assembly comparision for ${sample}
-            echo ${sample},${HQ_Reads},metaspades,${spades_stats},${spades_counts},${spades_match},${spades_cov},${spades_Pa} >> ./02_Assembly/stats/${sample}_assembly_stats.csv
-            echo ${sample},${HQ_Reads},megahit,${megahit_stats},${megahit_counts},${megahit_match},${megahit_cov},${megahit_Pa} >> ./02_Assembly/stats/${sample}_assembly_stats.csv
-            echo ${sample},${HQ_Reads},idba_ud,${idba_stats},${idba_counts},${idba_match},${idba_cov},NA >> ./02_Assembly/stats/${sample}_assembly_stats.csv
+            echo -e "${sample}\t${HQ_Reads}\tmetaspades\t${spades_stats}\t${spades_counts}\t${spades_match}\t${spades_cov}\t${spades_Pa}" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv
+            echo -e "${sample}\t${HQ_Reads}\tmegahit\t${megahit_stats}\t${megahit_counts}\t${megahit_match}\t${megahit_cov}\t${megahit_Pa}" >> ./02_Assembly/stats/${sample}_assembly_summary.tsv
         fi
 
     elif [[ "${HQ_Reads}" == Skip ]]; then :
     fi
     ###################
     ### Assembly QC ###
     ###################
-    read -r -p "Would you like to evaluate the ${sample} assemblies? yes or no: " quast || exit 100
+    #read -r -p "Would you like to evaluate the ${sample} assemblies? yes or no: " quast || exit 100
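+    # Editor's sketch (optional; column is part of util-linux): the tab-separated summary
+    # consulted below is easier to scan when aligned, e.g.:
+    # column -t -s$'\t' ./02_Assembly/stats/${sample}_assembly_summary.tsv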
yes or no: " quast || exit 100 if [[ "${quast}" == yes ]]; then - # --- Evaluation --- + if test -f "./02_Assembly/quast/${sample}.log"; then : + else + # --- Evaluation --- echo Evaluating ${sample} assemblies - #statswrapper.sh ./02_Assembly/quast/${sample}/*.fasta minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/stats/${sample}.txt ~/GitHub/quast/metaquast.py -f -m 1000 -o ./02_Assembly/quast/${sample} ./02_Assembly/quast/${sample}/*.fasta >> ./02_Assembly/quast/${sample}.log 2>&1 + fi + elif [[ "${quast}" == no ]]; then : fi - cut -f2,3,12 -d ',' ./02_Assembly/stats/${sample}_assembly_stats.csv - read -r -p "Analyze the ./02_Assembly/stats/${sample}_assembly_stats.csv and ./02_Assembly/quast/${sample}/report.html and determine which ${sample} assembly is best. Please enter the absoute path to the assembly.fasta file: " best_assembly || exit 100 + + cut -f2,3,5,12 -d$'\t' ./02_Assembly/stats/${sample}_assembly_summary.tsv + ls `pwd`/02_Assembly/quast/${sample}/*.fasta + ## Future, would be nice to also include the paths to each. So don't have to leave the command line. + read -r -p "Analyze the ./02_Assembly/stats/${sample}_assembly_summary.tsv and ./02_Assembly/quast/${sample}/report.html and determine which ${sample} assembly is best. Please enter the absoute path to the assembly.fasta file: " best_assembly || exit 100 + read -r -p "Would you like to estimate the taxonomy for the ${sample} metagenome? yes or no: " taxa_1 || exit 100 + read -r -p "Would you like to analyze the ${sample} assembly using anvi'o? yes or no: " anvio_1 || exit 100 + read -r -p "Would you like to annotate your ${sample} metagenome for biosynthetic gene clusters? yes or no: " smash || exit 100 + read -r -p "What minimum contig length do you want to analyze for biosynthetic gene clusters? or type NA " min_BGC || exit 100 + read -r -p "Would you like to analyze annotated biosynthetic gene clusters for each sample against the MiBIG database? yes or no: " bigscape || exit 100 ################### #### TAXONOMY ##### ################### - read -r -p "Would you like to estimate the taxonomy for the ${sample} metagenome? yes or no: " taxa_1 || exit 100 if [[ "${taxa_1}" == yes ]]; then # #Pick which assembly you like best by comparing assembly_stats.csv # Copy best assembly to 02_Assembly directory @@ -470,7 +604,7 @@ if [[ "${assemble}" == yes ]]; then ### CONDA ##### ############### # Please follow the conda installation instructions at: https://anvio.org/install - read -r -p "Would you like to analyze the ${sample} assembly using anvi'o? yes or no: " anvio_1 || exit 100 + #read -r -p "Would you like to analyze the ${sample} assembly using anvi'o? yes or no: " anvio_1 || exit 100 if [[ "${anvio_1}" == yes ]]; then #Or, try to determine taxonomy on a per-contig basis. If this is not sensitive enough, try BLAST instead. 
         # sendsketch.sh in=${best_assembly} persequence minhits=1 records=4 >> ./02_Assembly/taxonomy/${sample}.log 2>&1
@@ -507,7 +641,8 @@ if [[ "${assemble}" == yes ]]; then
         echo exporing gene calls to ${sample} contig database
         anvi-export-gene-calls -c ./03_Anvio/${sample}/${sample}.db --gene-caller prodigal -o ./03_Anvio/${sample}/prodigal/${sample}_AllGeneCalls.txt >> ./03_Anvio/logs/${sample}.log 2>&1
         echo running SCG taxonomy on ${sample}
-        anvi-run-scg-taxonomy -c ./03_Anvio/${sample}/${sample}.db -T ${CPU} >> ./03_Anvio/logs/${sample}.log 2>&1
+        anvi-run-scg-taxonomy -c ./03_Anvio/${sample}/${sample}.db -T ${CPU} --all-hits-output-file ./03_Anvio/${sample}/${sample}_hits.txt >> ./03_Anvio/logs/${sample}.log 2>&1
+        anvi-estimate-scg-taxonomy -c ./03_Anvio/${sample}/${sample}.db --metagenome-mode --per-scg-output-file ./03_Anvio/${sample}/${sample}-scg.txt
 
         #Remove zeros from column 5 e-value and only write significant hits
         echo creating functional profile on ${sample}
@@ -534,9 +669,9 @@ if [[ "${assemble}" == yes ]]; then
     # Both antismash and bigscape are run in Docker containers. Please visit their websites for installation instructions:
     #https://docs.antismash.secondarymetabolites.org/install/
     #https://nselem.github.io/bigscape-corason/installation/
-    read -r -p "Would you like to annotate your ${sample} metagenome for biosynthetic gene clusters? yes or no: " smash || exit 100
+    #read -r -p "Would you like to annotate your ${sample} metagenome for biosynthetic gene clusters? yes or no: " smash || exit 100
     if [[ "${smash}" == yes ]]; then
-        read -r -p "What minimum contig length do you want to analyze for biosynthetic gene clusters? " min_BGC || exit 100
+        #read -r -p "What minimum contig length do you want to analyze for biosynthetic gene clusters? " min_BGC || exit 100
         mkdir -p ./04_Annotations/antismash/${sample}_c${min_BGC}
 
         echo Running antismash to annotate ${sample} secondary metabolites
@@ -552,33 +687,26 @@ if [[ "${assemble}" == yes ]]; then
             --genefinding-tool prodigal-m \
             --cc-mibig \
             --minlength ${min_BGC} \
-            -c ${CPU} >> ./${sample}.log 2>&1
+            -c ${CPU} >> ./04_Annotations/antismash/${sample}.log 2>&1 || echo "No BGCs found in ${sample}"
 
-        mkdir -p ./04_Annotations/bigscape/c${min_BGC}
-
-        cd ./04_Annotations/antismash/${sample}_c${min_BGC}/${sample}_renamed_c1k
-
-        if test -f "c*.gbk"; then
-            for genbank_file in $(ls c*.gbk); do
-                cluster=$(echo ${genbank_file} | cut -f1 -d '_')
-                echo copying and renaming ${genbank_file} to cluster_${cluster}_${sample}.gbk
-                cp ${genbank_file} ${project_dir}/04_Annotations/bigscape/c${min_BGC}/cluster_${cluster}_${sample}.gbk
-            done
-        else echo No BGC found in ${sample}
-        fi
         cd ${project_dir}
     elif [[ "${smash}" == no ]]; then :
     fi
     done
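+
+    # Editor's sketch (assumes antiSMASH >= 5 naming, where each annotated cluster is written
+    # to a *.region*.gbk file): a quick census of candidate BGCs before running BiG-SCAPE.
+    # find ./04_Annotations/antismash -name "*.region*.gbk" | wc -l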
yes or no: " bigscape || exit 100 -if [[ "${bigscape}" == yes ]]; then - echo Running bigscape on ${project} antismash genbank files per sample - cd ${project_dir}/04_Annotations/bigscape - ~/bin/run_bigscape.py c${min_BGC} c${min_BGC}_results --mix --mibig -c ${CPU} >> ./${project}.log 2&>1 -elif [[ "${bigscape}" == no ]]; then : -fi ######################################################################################################################################### ############################## BEGIN CO-ASSEMBLY ############################## ######################################################################################################################################### @@ -597,6 +725,10 @@ if [[ "${coassemble}" == yes ]]; then ################ conda activate assembly read -r -p "Which HQ reads dataset do you want to use to co-assemble $project? HQ HQ_ecc HQ_merged Skip: " HQ_Reads || exit 100 + read -r -p "Would you like to evaluate the quality of all the assemblies? yes or no: " quast || exit 100 + + + if [[ "${HQ_Reads}" == HQ ]] then # Check to see if the project reads have already been created. If not, then create them. @@ -619,9 +751,7 @@ if [[ "${coassemble}" == yes ]]; then else #Assemble reads per site using megahit echo Using megahit to assemble ${HQ_Reads} ${project} reads. - megahit --12 ./01_HQ_Reads/${project}_HQ.fq.gz --presets meta-large -m ${RAM} -t ${CPU} -o ~/${project} >> ./02_Assembly/megahit-large/logs/${project}.log 2>&1 - # use different k-mers for shorter reads - #megahit --12 ./01_HQ_Reads/${project}_HQ.fq.gz --k-min 21 --k-max 91 --k-step 10 -m ${RAM} -t ${CPU} -o ~/${project} >> ./02_Assembly/megahit-large/logs/${project}.log 2>&1 + megahit --12 ./01_HQ_Reads/${project}_HQ.fq.gz --presets meta-large -m 0.5 -t ${CPU} -o ~/${project} >> ./02_Assembly/megahit-large/logs/${project}.log 2>&1 cp ~/${project}/final.contigs.fa ./02_Assembly/quast/${project}/megahit.fasta mkdir -p ./02_Assembly/megahit-large/${project} mv ~/${project} ./02_Assembly/megahit-large @@ -645,18 +775,23 @@ if [[ "${coassemble}" == yes ]]; then cp ./02_Assembly/idba_ud/${project}/scaffold.fa ./02_Assembly/quast/${project}/idba.fasta fi - if test -f "./02_Assembly/stats/${project}_assembly_stats.csv"; then : + if grep -m 1 -q "${HQ_Reads}" ./02_Assembly/stats/${project}_assembly_summary.tsv; + then echo ${HQ_Reads} assemblies have already had stats run. 
         else
             # --- Evaluation ---
-            echo metagenome_id,reads,assembler,n_contigs,contig_bp,gap_pct,ctg_L50,ctg_max,gene_counts,mapped_pct,avg_cov,assembly_performance >> ./02_Assembly/stats/${project}_assembly_stats.csv
+            echo -e "metagenome_id\treads\tassembler\tn_contigs\tcontig_bp\tgap_pct\tctg_L50\tctg_max\tgene_counts\tmapped_pct\tavg_cov\tassembly_performance" >> ./02_Assembly/stats/${project}_assembly_summary.tsv
             echo Evaluating ${project} assemblies with AssemblyStats
             ##statswrapper.sh ./02_Assembly/quast/${project}/*.fasta minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/stats/${project}.txt
             stats.sh ./02_Assembly/metaspades/${project}/scaffolds.fasta minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/metaspades/${project}/stats.txt
-            spades_stats=$(cat ./02_Assembly/metaspades/${project}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+            spades_stats=$(cat ./02_Assembly/metaspades/${project}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
+            spades_L50=$(cat ./02_Assembly/metaspades/${project}/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
             stats.sh ./02_Assembly/megahit-large/${project}/final.contigs.fa minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/megahit-large/${project}/stats.txt
-            megahit_stats=$(cat ./02_Assembly/megahit-large/${project}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+            megahit_stats=$(cat ./02_Assembly/megahit-large/${project}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
+            megahit_L50=$(cat ./02_Assembly/megahit-large/${project}/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
             stats.sh ./02_Assembly/idba_ud/${project}/scaffold.fa minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/idba_ud/${project}/stats.txt
-            idba_stats=$(cat ./02_Assembly/idba_ud/${project}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+            idba_stats=$(cat ./02_Assembly/idba_ud/${project}/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
+            idba_L50=$(cat ./02_Assembly/idba_ud/${project}/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+
             #Evaluate assemblies with Quast (leave out "-R ref.fa if you don't have a reference)
 
             echo Predicting prokaryotic genes in each ${project} assembly
@@ -674,28 +809,34 @@ if [[ "${coassemble}" == yes ]]; then
             idba_counts=$(grep '^>' ./02_Assembly/idba_ud/${project}/${project}_c1k_proteins.faa | wc -l )
 
             echo Determining metagenome content within ${project} assemblies
+            # If project reads have not already been concatenated, do so now.
+            if test -f "./01_HQ_Reads/${project}_HQ.fq.gz"; then :
+            else
+                echo Concatenating ${project} HQ reads
+                cat ./01_HQ_Reads/*_HQ.fq.gz > ./01_HQ_Reads/${project}_HQ.fq.gz
+            fi
             # Note that these are reads mapped to contigs > 2,000 bp.
             #(i.e. % mapped) by mapping reads by to assembly
             bbmap.sh in=./01_HQ_Reads/${project}_HQ.fq.gz ref=./02_Assembly/metaspades/${project}/${project}_c1k.fa nodisk covhist=./02_Assembly/metaspades/${project}/covhist.txt covstats=./02_Assembly/metaspades/${project}/covstats.txt out=./02_Assembly/metaspades/${project}/${project}_reads_assembled.fq.gz outu=./02_Assembly/metaspades/${project}/${project}_reads_unassembled.fq.gz fast=t ambig=best ow >> ./02_Assembly/metaspades/${project}/map.log 2>&1
-            spades_match=$(grep Percent\ mapped: ./02_Assembly/metaspades/${project}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            spades_cov=$(grep Average\ coverage: ./02_Assembly/metaspades/${project}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            spades_Pa=$(echo "$spades_match*$spades_cov" | bc)
+            spades_match=$(grep Percent\ mapped: ./02_Assembly/metaspades/${project}/map.log | cut -f2 -d$'\t' | tail -1)
+            spades_cov=$(grep Average\ coverage: ./02_Assembly/metaspades/${project}/map.log | cut -f2 -d$'\t' | tail -1)
+            spades_Pa=$(echo "$spades_match*$spades_L50" | bc)
 
             bbmap.sh in=./01_HQ_Reads/${project}_HQ.fq.gz ref=./02_Assembly/megahit-large/${project}/${project}_c1k.fa nodisk covhist=./02_Assembly/megahit-large/${project}/covhist.txt covstats=./02_Assembly/megahit-large/${project}/covstats.txt outm=./02_Assembly/megahit-large/${project}/${project}_reads_assembled.fq.gz outu=./02_Assembly/megahit-large/${project}/${project}_reads_unassembled.fq.gz fast=t ambig=best ow >> ./02_Assembly/megahit-large/${project}/map.log 2>&1
-            megahit_match=$(grep Percent\ mapped: ./02_Assembly/megahit-large/${project}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            megahit_cov=$(grep Average\ coverage: ./02_Assembly/megahit-large/${project}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            megahit_Pa=$(echo "$megahit_match*$megahit_cov" | bc)
+            megahit_match=$(grep Percent\ mapped: ./02_Assembly/megahit-large/${project}/map.log | cut -f2 -d$'\t' | tail -1)
+            megahit_cov=$(grep Average\ coverage: ./02_Assembly/megahit-large/${project}/map.log | cut -f2 -d$'\t' | tail -1)
+            megahit_Pa=$(echo "$megahit_match*$megahit_L50" | bc)
 
             bbmap.sh in=./01_HQ_Reads/${project}_HQ.fq.gz ref=./02_Assembly/idba_ud/${project}/${project}_c1k.fa nodisk covhist=./02_Assembly/idba_ud/${project}/covhist.txt covstats=./02_Assembly/idba_ud/${project}/covstats.txt outm=./02_Assembly/idba_ud/${project}/${project}_reads_assembled.fq.gz outu=./02_Assembly/idba_ud/${project}/${project}_reads_unassembled.fq.gz fast=t ambig=best ow >> ./02_Assembly/idba_ud/${project}/map.log 2>&1
-            idba_match=$(grep Percent\ mapped: ./02_Assembly/idba_ud/${project}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            idba_cov=$(grep Average\ coverage: ./02_Assembly/idba_ud/${project}/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-            idba_Pa=$(echo "$idba_match*$idba_cov" | bc)
+            idba_match=$(grep Percent\ mapped: ./02_Assembly/idba_ud/${project}/map.log | cut -f2 -d$'\t' | tail -1)
+            idba_cov=$(grep Average\ coverage: ./02_Assembly/idba_ud/${project}/map.log | cut -f2 -d$'\t' | tail -1)
+            idba_Pa=$(echo "$idba_match*$idba_L50" | bc)
 
             echo Creating assembly comparision for ${project}
-            echo ${project},${HQ_Reads},metaspades,${spades_stats},${spades_counts},${spades_match},${spades_cov},${spades_Pa} >> ./02_Assembly/stats/${project}_assembly_stats.csv
-            echo ${project},${HQ_Reads},megahit,${megahit_stats},${megahit_counts},${megahit_match},${megahit_cov},${megahit_Pa} >> ./02_Assembly/stats/${project}_assembly_stats.csv
-		echo ${project},${HQ_Reads},idba_ud,${idba_stats},${idba_counts},${idba_match},${idba_cov},${idba_Pa} >> ./02_Assembly/stats/${project}_assembly_stats.csv
+		echo -e "${project}\t${HQ_Reads}\tmetaspades\t${spades_stats}\t${spades_counts}\t${spades_match}\t${spades_cov}\t${spades_Pa}" >> ./02_Assembly/stats/${project}_assembly_summary.tsv
+		echo -e "${project}\t${HQ_Reads}\tmegahit\t${megahit_stats}\t${megahit_counts}\t${megahit_match}\t${megahit_cov}\t${megahit_Pa}" >> ./02_Assembly/stats/${project}_assembly_summary.tsv
+		echo -e "${project}\t${HQ_Reads}\tidba_ud\t${idba_stats}\t${idba_counts}\t${idba_match}\t${idba_cov}\t${idba_Pa}" >> ./02_Assembly/stats/${project}_assembly_summary.tsv
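+		# The assembly_performance column is simply percent mapped multiplied by
+		# ctg_L50, e.g. 95.2 * 12 = 1142.4 (hypothetical numbers). It is a relative
+		# score for comparing the three assemblers on the same reads, not an
+		# absolute quality measure.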
 	fi
 elif [[ "${HQ_Reads}" == HQ_ecc ]]
 then
@@ -710,6 +851,7 @@ if [[ "${coassemble}" == yes ]]; then
 	if test -f "./02_Assembly/metaspades/${project}_ecc/scaffolds.fasta"; then :
 	else
+		echo Using metaspades to assemble ${HQ_Reads} ${project} reads.
 		mkdir -p ./02_Assembly/metaspades/${project}_ecc
 		metaspades.py -k 21,33,55,77,99,127 --12 ./01_HQ_Reads/error_corrected/${project}_HQ_ecc.fq.gz -t ${CPU} -m ${RAM} -o ./02_Assembly/metaspades/${project}_ecc >> ./02_Assembly/metaspades/logs/${project}.log 2>&1
 		cp ./02_Assembly/metaspades/${project}_ecc/scaffolds.fasta ./02_Assembly/quast/${project}/metaspades_ecc.fasta
@@ -720,12 +862,10 @@ if [[ "${coassemble}" == yes ]]; then
 	else
 		#Assemble reads per site using megahit
 		echo Using megahit to assemble ${HQ_Reads} ${project} reads.
-
-		megahit --12 ./01_HQ_Reads/error_corrected/${project}_HQ_ecc.fq.gz --presets meta-large -m ${RAM} -t ${CPU} -o ./02_Assembly/megahit-large/${project} >> ./02_Assembly/megahit-large/logs/${project}.log 2>&1
-		#megahit --12 ./01_HQ_Reads/error_corrected/${project}_HQ_ecc.fq.gz --k-min 21 --k-max 91 --k-step 10 -m ${RAM} -t ${CPU} -o ~/${project}_ecc >> ./02_Assembly/megahit-large/logs/${project}.log 2>&1
+		# Because megahit won't write to an external hard drive, its working directory is created in the home directory and then moved into the project directory.
+		megahit --12 ./01_HQ_Reads/error_corrected/${project}_HQ_ecc.fq.gz --presets meta-large -m 0.5 -t ${CPU} -o ~/${project}_ecc >> ./02_Assembly/megahit-large/logs/${project}.log 2>&1
 		cp ~/${project}_ecc/final.contigs.fa ./02_Assembly/quast/${project}/megahit_ecc.fasta
-		mkdir -p ./02_Assembly/megahit-large/${project}_ecc
-		mv ~/${project}_ecc/ ./02_Assembly/megahit-large
+		mv ~/${project}_ecc/ ./02_Assembly/megahit-large/
 	fi
 	if test -f "./01_HQ_Reads/error_corrected/${project}_HQ_ecc.fa";
@@ -747,16 +887,22 @@ if [[ "${coassemble}" == yes ]]; then
 		cp ./02_Assembly/idba_ud/${project}_ecc/scaffold.fa ./02_Assembly/quast/${project}/idba_ecc.fasta
 	fi
 	# --- Evaluation ---
-	if test -f "./02_Assembly/stats/${project}_assembly_stats.csv"; then :
+	if grep -m 1 -q "${HQ_Reads}" ./02_Assembly/stats/${project}_assembly_summary.tsv;
+	then echo ${HQ_Reads} assemblies have already had stats run.
 	else
+		echo -e "metagenome_id\treads\tassembler\tn_contigs\tcontig_bp\tgap_pct\tctg_L50\tctg_max\tgene_counts\tmapped_pct\tavg_cov\tassembly_performance" >> ./02_Assembly/stats/${project}_assembly_summary.tsv
 		echo Evaluating ${project} assemblies with AssemblyStats
 		stats.sh ./02_Assembly/metaspades/${project}_ecc/scaffolds.fasta minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/metaspades/${project}_ecc/stats.txt
-		spades_stats=$(cat ./02_Assembly/metaspades/${project}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+		spades_stats=$(cat ./02_Assembly/metaspades/${project}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
 		stats.sh ./02_Assembly/megahit-large/${project}_ecc/final.contigs.fa minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/megahit-large/${project}_ecc/stats.txt
-		megahit_stats=$(cat ./02_Assembly/megahit-large/${project}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+		megahit_stats=$(cat ./02_Assembly/megahit-large/${project}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
 		stats.sh ./02_Assembly/idba_ud/${project}_ecc/scaffold.fa minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/idba_ud/${project}_ecc/stats.txt
-		idba_stats=$(cat ./02_Assembly/idba_ud/${project}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+		idba_stats=$(cat ./02_Assembly/idba_ud/${project}_ecc/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
+
+		spades_L50=$(cat ./02_Assembly/metaspades/${project}_ecc/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+		megahit_L50=$(cat ./02_Assembly/megahit-large/${project}_ecc/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+		idba_L50=$(cat ./02_Assembly/idba_ud/${project}_ecc/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+
 		#Evaluate assemblies with Quast (leave out "-R ref.fa" if you don't have a reference)

 		echo Predicting prokaryotic genes in each ${project} assembly
@@ -774,27 +920,33 @@ if [[ "${coassemble}" == yes ]]; then
 		idba_counts=$(grep '^>' ./02_Assembly/idba_ud/${project}_ecc/${project}_c1k_proteins.faa | wc -l )

 		echo Determining metagenome content within ${project} assemblies
+		# If project reads have not already been concatenated, do so now.
+		if test -f "./01_HQ_Reads/${project}_HQ.fq.gz"; then :
+		else
+			echo Concatenating ${project} HQ reads
+			cat ./01_HQ_Reads/*_HQ.fq.gz > ./01_HQ_Reads/${project}_HQ.fq.gz
+		fi
 		# Note that these are reads mapped to contigs > 1,000 bp (the _c1k assemblies),
 		# i.e. % mapped comes from mapping the reads back to the assembly.
 		bbmap.sh in=./01_HQ_Reads/${project}_HQ.fq.gz ref=./02_Assembly/metaspades/${project}_ecc/${project}_c1k.fa nodisk covhist=./02_Assembly/metaspades/${project}_ecc/covhist.txt covstats=./02_Assembly/metaspades/${project}_ecc/covstats.txt outm=./02_Assembly/metaspades/${project}_ecc/${project}_reads_assembled.fq.gz outu=./02_Assembly/metaspades/${project}_ecc/${project}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/metaspades/${project}_ecc/map.log 2>&1
-		spades_match=$(grep Percent\ mapped: ./02_Assembly/metaspades/${project}_ecc/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-		spades_cov=$(grep Average\ coverage: ./02_Assembly/metaspades/${project}_ecc/map.log | cut -f2 -d$'\t' | tail -1 | tail -1)
-		spades_Pa=$(echo "$spades_match*$spades_cov" | bc)
+		spades_match=$(grep Percent\ mapped: ./02_Assembly/metaspades/${project}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
+		spades_cov=$(grep Average\ coverage: ./02_Assembly/metaspades/${project}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
+		spades_Pa=$(echo "$spades_match*$spades_L50" | bc)
 		bbmap.sh in=./01_HQ_Reads/${project}_HQ.fq.gz ref=./02_Assembly/megahit-large/${project}_ecc/${project}_c1k.fa nodisk covhist=./02_Assembly/megahit-large/${project}_ecc/covhist.txt covstats=./02_Assembly/megahit-large/${project}_ecc/covstats.txt outm=./02_Assembly/megahit-large/${project}_ecc/${project}_reads_assembled.fq.gz outu=./02_Assembly/megahit-large/${project}_ecc/${project}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/megahit-large/${project}_ecc/map.log 2>&1
 		megahit_match=$(grep Percent\ mapped: ./02_Assembly/megahit-large/${project}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
 		megahit_cov=$(grep Average\ coverage: ./02_Assembly/megahit-large/${project}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
-		megahit_Pa=$(echo "$megahit_match*$megahit_cov" | bc)
+		megahit_Pa=$(echo "$megahit_match*$megahit_L50" | bc)
 		bbmap.sh in=./01_HQ_Reads/${project}_HQ.fq.gz ref=./02_Assembly/idba_ud/${project}_ecc/${project}_c1k.fa nodisk covhist=./02_Assembly/idba_ud/${project}_ecc/covhist.txt covstats=./02_Assembly/idba_ud/${project}_ecc/covstats.txt outm=./02_Assembly/idba_ud/${project}_ecc/${project}_reads_assembled.fq.gz outu=./02_Assembly/idba_ud/${project}_ecc/${project}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/idba_ud/${project}_ecc/map.log 2>&1
 		idba_match=$(grep Percent\ mapped: ./02_Assembly/idba_ud/${project}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
 		idba_cov=$(grep Average\ coverage: ./02_Assembly/idba_ud/${project}_ecc/map.log | cut -f2 -d$'\t' | tail -1)
-		idba_Pa=$(echo "$idba_match*$idba_cov" | bc)
+		idba_Pa=$(echo "$idba_match*$idba_L50" | bc)

 		echo Creating assembly comparison for ${project}
-		echo ${project},${HQ_Reads},metaspades,${spades_stats},${spades_counts},${spades_match},${spades_cov},${spades_Pa} >> ./02_Assembly/stats/${project}_assembly_stats.csv
-		echo ${project},${HQ_Reads},megahit,${megahit_stats},${megahit_counts},${megahit_match},${megahit_cov},${megahit_Pa} >> ./02_Assembly/stats/${project}_assembly_stats.csv
-		echo ${project},${HQ_Reads},idba_ud,${idba_stats},${idba_counts},${idba_match},${idba_cov},${idba_Pa} >> ./02_Assembly/stats/${project}_assembly_stats.csv
+		echo -e "${project}\t${HQ_Reads}\tmetaspades\t${spades_stats}\t${spades_counts}\t${spades_match}\t${spades_cov}\t${spades_Pa}" >> ./02_Assembly/stats/${project}_assembly_summary.tsv
"${project}\t${HQ_Reads}\tmegahit\t${megahit_stats}\t${megahit_counts}\t${megahit_match}\t${megahit_cov}\t${megahit_Pa}" >> ./02_Assembly/stats/${project}_assembly_summary.tsv + echo -e "${project}\t${HQ_Reads}\tidba_ud\t${idba_stats}\t${idba_counts}\t${idba_match}\t${idba_cov}\t${idba_Pa}" >> ./02_Assembly/stats/${project}_assembly_summary.tsv fi elif [[ "${HQ_Reads}" == HQ_merged ]] then @@ -820,22 +972,26 @@ if [[ "${coassemble}" == yes ]]; then then : else echo Using megahit to assemble ${HQ_Reads} ${project} reads. - megahit -r ./01_HQ_Reads/merged_reads/${project}_HQ_merged.fq.gz --12 ./01_HQ_Reads/merged_reads/${project}_HQ_unmerged.fq.gz --presets meta-large -m ${RAM} -t ${CPU} -o ~/${project}_merged >> ./02_Assembly/megahit-large/logs/${project}.log 2>&1 - #megahit -r ./01_HQ_Reads/merged_reads/${project}_HQ_merged.fq.gz --12 ./01_HQ_Reads/merged_reads/${project}_HQ_unmerged.fq.gz --k-min 21 --k-max 91 --k-step 10 -m ${RAM} -t ${CPU} -o ~/${project}_merged >> ./02_Assembly/megahit-large/logs/${project}.log 2>&1 + megahit -r ./01_HQ_Reads/merged_reads/${project}_HQ_merged.fq.gz --12 ./01_HQ_Reads/merged_reads/${project}_HQ_unmerged.fq.gz --presets meta-large -m 0.5 -t ${CPU} -o ~/${project}_merged >> ./02_Assembly/megahit-large/logs/${project}.log 2>&1 cp ~/${project}_merged/final.contigs.fa ./02_Assembly/quast/${project}/megahit_merged.fasta mkdir -p ./02_Assembly/megahit-large/${project}_merged/ mv ~/${project}_merged/ ./02_Assembly/megahit-large/ echo idba_ud does not assemble merged reads and will not be used for comparison. fi - if test -f "./02_Assembly/stats/${project}_assembly_stats.csv"; then : + if grep -m 1 -q "${HQ_Reads}" ./02_Assembly/stats/${project}_assembly_summary.tsv; + then echo ${HQ_Reads} assemblies have already had stats run. 
 	else
-		echo metagenome_id,reads,assembler,n_contigs,contig_bp,gap_pct,ctg_L50,ctg_max,gene_counts,mapped_pct,avg_cov,assembly_performance >> ./02_Assembly/stats/${project}_assembly_stats.csv
+		echo -e "metagenome_id\treads\tassembler\tn_contigs\tcontig_bp\tgap_pct\tctg_L50\tctg_max\tgene_counts\tmapped_pct\tavg_cov\tassembly_performance" >> ./02_Assembly/stats/${project}_assembly_summary.tsv
 		echo Evaluating ${project} assemblies with AssemblyStats
 		stats.sh ./02_Assembly/metaspades/${project}_merged/scaffolds.fasta minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/metaspades/${project}_merged/stats.txt
-		spades_stats=$(cat ./02_Assembly/metaspades/${project}_merged/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+		spades_stats=$(cat ./02_Assembly/metaspades/${project}_merged/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
 		stats.sh ./02_Assembly/megahit-large/${project}_merged/final.contigs.fa minscaf=1000 format=3 -Xmx${RAM}G ow=t out=./02_Assembly/megahit-large/${project}_merged/stats.txt
-		megahit_stats=$(cat ./02_Assembly/megahit-large/${project}_merged/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d' | sed 's/\t/,/g')
+		megahit_stats=$(cat ./02_Assembly/megahit-large/${project}_merged/stats.txt | cut -f2,4,5,9,15 -d$'\t' | sed '2q;d')
+
+		spades_L50=$(cat ./02_Assembly/metaspades/${project}_merged/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+		megahit_L50=$(cat ./02_Assembly/megahit-large/${project}_merged/stats.txt | cut -f9 -d$'\t' | sed '2q;d')
+
 		#Evaluate assemblies with Quast (leave out "-R ref.fa" if you don't have a reference)

 		echo Predicting prokaryotic genes in each ${project} assembly
@@ -849,26 +1005,28 @@ if [[ "${coassemble}" == yes ]]; then
 		megahit_counts=$(grep '^>' ./02_Assembly/megahit-large/${project}_merged/${project}_c1k_proteins.faa | wc -l )

 		echo Determining metagenome content within ${project} assemblies
-		# Note that these are reads mapped to contigs > 2,000 bp.
-		#(i.e. % mapped) by mapping reads by to assembly
+		# If project reads have not already been concatenated, do so now.
 		if test -f "./01_HQ_Reads/${project}_HQ.fq.gz"; then :
 		else
+			echo Concatenating ${project} HQ reads
 			cat ./01_HQ_Reads/*_HQ.fq.gz > ./01_HQ_Reads/${project}_HQ.fq.gz
 		fi
+		# Note that these are reads mapped to contigs > 1,000 bp (the _c1k assemblies),
+		# i.e. % mapped comes from mapping the reads back to the assembly.
+
 		bbmap.sh in=./01_HQ_Reads/${project}_HQ.fq.gz ref=./02_Assembly/metaspades/${project}_merged/${project}_c1k.fa nodisk covhist=./02_Assembly/metaspades/${project}_merged/covhist.txt covstats=./02_Assembly/metaspades/${project}_merged/covstats.txt outm=./02_Assembly/metaspades/${project}_merged/${project}_reads_assembled.fq.gz outu=./02_Assembly/metaspades/${project}_merged/${project}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/metaspades/${project}_merged/map.log 2>&1
 		spades_match=$(grep Percent\ mapped: ./02_Assembly/metaspades/${project}_merged/map.log | cut -f2 -d$'\t' | tail -1)
 		spades_cov=$(grep Average\ coverage: ./02_Assembly/metaspades/${project}_merged/map.log | cut -f2 -d$'\t' | tail -1)
-		spades_Pa=$(echo "$spades_match*$spades_cov" | bc)
+		spades_Pa=$(echo "$spades_match*$spades_L50" | bc)
 		bbmap.sh in=./01_HQ_Reads/${project}_HQ.fq.gz ref=./02_Assembly/megahit-large/${project}_merged/${project}_c1k.fa nodisk covhist=./02_Assembly/megahit-large/${project}_merged/covhist.txt covstats=./02_Assembly/megahit-large/${project}_merged/covstats.txt outm=./02_Assembly/megahit-large/${project}_merged/${project}_reads_assembled.fq.gz outu=./02_Assembly/megahit-large/${project}_merged/${project}_reads_unassembled.fq.gz fast=t ambig=best >> ./02_Assembly/megahit-large/${project}_merged/map.log 2>&1
 		megahit_match=$(grep Percent\ mapped: ./02_Assembly/megahit-large/${project}_merged/map.log | cut -f2 -d$'\t' | tail -1)
 		megahit_cov=$(grep Average\ coverage: ./02_Assembly/megahit-large/${project}_merged/map.log | cut -f2 -d$'\t' | tail -1)
-		metahit_Pa=$(echo "$megahit_match*$megahit_cov" | bc)
+		megahit_Pa=$(echo "$megahit_match*$megahit_L50" | bc)

 		echo Creating assembly comparison for ${project}
-		echo ${project},${HQ_Reads},metaspades,${spades_stats},${spades_counts},${spades_match},${spades_cov},${spades_Pa} >> ./02_Assembly/stats/${project}_assembly_stats.csv
-		echo ${project},${HQ_Reads},megahit,${megahit_stats},${megahit_counts},${megahit_match},${megahit_cov},${megahit_Pa} >> ./02_Assembly/stats/${project}_assembly_stats.csv
-		echo ${project},${HQ_Reads},idba_ud,${idba_stats},${idba_counts},${idba_match},${idba_cov},NA >> ./02_Assembly/stats/${project}_assembly_stats.csv
+		echo -e "${project}\t${HQ_Reads}\tmetaspades\t${spades_stats}\t${spades_counts}\t${spades_match}\t${spades_cov}\t${spades_Pa}" >> ./02_Assembly/stats/${project}_assembly_summary.tsv
+		echo -e "${project}\t${HQ_Reads}\tmegahit\t${megahit_stats}\t${megahit_counts}\t${megahit_match}\t${megahit_cov}\t${megahit_Pa}" >> ./02_Assembly/stats/${project}_assembly_summary.tsv
 	fi
 elif [[ "${HQ_Reads}" == Skip ]]
 then :
@@ -876,7 +1034,7 @@ if [[ "${coassemble}" == yes ]]; then
 	######################
 	#### ASSEMBLY QC #####
 	######################
-	read -r -p "Would you like to evaluate the quality of all the assemblies? yes or no: " quast || exit 100
yes or no: " quast || exit 100 if [[ "${quast}" == yes ]]; then echo Evaluating ${project} assemblies @@ -884,14 +1042,20 @@ if [[ "${coassemble}" == yes ]]; then ~/GitHub/quast/metaquast.py -f -m 1000 -o ./02_Assembly/quast/${project} ./02_Assembly/quast/${project}/*.fasta >> ./02_Assembly/quast/${project}.log 2>&1 elif [[ "${quast}" == no ]]; then : fi - cut -f2,3,12 -d ',' ./02_Assembly/stats/${project}_assembly_stats.csv - + cut -f2,3,5,12 -d$'\t' ./02_Assembly/stats/${project}_assembly_summary.tsv + ls `pwd`/02_Assembly/quast/${project}/*.fasta # Pick which assembly you like best by comparing assembly_stats.csv - read -r -p "Analyze the ./02_Assembly/stats/${project}_assembly_stats.csv and ./02_Assembly/quast/${project}/report.html and determine which ${project} assembly is best. Please enter the absoute path to the assembly.fasta file: " best_assembly || exit 100 + + read -r -p "Analyze the ./02_Assembly/stats/${project}_assembly_summary.tsv and ./02_Assembly/quast/${project}/report.html and determine which ${project} assembly is best. Please enter the absoute path to the assembly.fasta file: " best_assembly || exit 100 + read -r -p "Would you like to estimate the taxonomy of your metagenome? yes or no : " taxa || exit 100 + read -r -p "Would you like to analyze your metagenome using anvi'o? yes or no: " anvio || exit 100 + read -r -p "What minimum contig length do you want to analyze for biosynthetic gene clusters? " min_BGC || exit 100 + read -r -p "Would you like to analyze annotated biosynthetic gene clusters for ${project} against the MiBIG database? yes or no: " bigscape2 || exit 100 + ################### #### TAXONOMY ##### ################### - read -r -p "Would you like to estimate the taxonomy of your metagenome? yes or no : " taxa || exit 100 + #read -r -p "Would you like to estimate the taxonomy of your metagenome? yes or no : " taxa || exit 100 if [[ "${taxa}" == yes ]]; then # For my analysis, 58 of the best assemblies were megahit and 2 were metaspades. mkdir -p ./02_Assembly/taxonomy @@ -905,7 +1069,7 @@ if [[ "${coassemble}" == yes ]]; then ################### ##### ANVI'O ###### ################### - read -r -p "Would you like to analyze your metagenome using anvi'o? yes or no: " anvio || exit 100 + #read -r -p "Would you like to analyze your metagenome using anvi'o? yes or no: " anvio || exit 100 if [[ "${anvio}" == yes ]]; then #Or, try to determine taxonomy on a per-contig basis. If this is not sensitive enough, try BLAST instead. 
 	# Pick which assembly you like best by comparing ${project}_assembly_summary.tsv
-	read -r -p "Analyze the ./02_Assembly/stats/${project}_assembly_stats.csv and ./02_Assembly/quast/${project}/report.html and determine which ${project} assembly is best. Please enter the absoute path to the assembly.fasta file: " best_assembly || exit 100
+
+	read -r -p "Analyze the ./02_Assembly/stats/${project}_assembly_summary.tsv and ./02_Assembly/quast/${project}/report.html and determine which ${project} assembly is best. Please enter the absolute path to the assembly.fasta file: " best_assembly || exit 100
+	read -r -p "Would you like to estimate the taxonomy of your metagenome? yes or no: " taxa || exit 100
+	read -r -p "Would you like to analyze your metagenome using anvi'o? yes or no: " anvio || exit 100
+	read -r -p "What minimum contig length do you want to analyze for biosynthetic gene clusters? " min_BGC || exit 100
+	read -r -p "Would you like to analyze annotated biosynthetic gene clusters for ${project} against the MiBIG database? yes or no: " bigscape2 || exit 100
+
 	###################
 	#### TAXONOMY #####
 	###################
-	read -r -p "Would you like to estimate the taxonomy of your metagenome? yes or no : " taxa || exit 100
+	#read -r -p "Would you like to estimate the taxonomy of your metagenome? yes or no : " taxa || exit 100
 	if [[ "${taxa}" == yes ]]; then
 		# For my analysis, 58 of the best assemblies were megahit and 2 were metaspades.
 		mkdir -p ./02_Assembly/taxonomy
@@ -905,7 +1069,7 @@ if [[ "${coassemble}" == yes ]]; then
 	###################
 	##### ANVI'O ######
 	###################
-	read -r -p "Would you like to analyze your metagenome using anvi'o? yes or no: " anvio || exit 100
+	#read -r -p "Would you like to analyze your metagenome using anvi'o? yes or no: " anvio || exit 100
 	if [[ "${anvio}" == yes ]]; then
 		#Or, try to determine taxonomy on a per-contig basis. If this is not sensitive enough, try BLAST instead.
 		# sendsketch.sh in=${best_assembly} persequence minhits=1 records=4 >> ./02_Assembly/taxonomy/${project}.log 2>&1
@@ -919,66 +1083,77 @@ if [[ "${coassemble}" == yes ]]; then
 		mkdir -p ./03_Anvio/${project}/KOfam
 		mkdir -p ./03_Anvio/${project}/Pfam
 		mkdir -p ./03_Anvio/${project}/prodigal
-		mkdir -p ./03_Anvio/logs
-		#Simplify headers as to not cause future anger; make a key so that contigs can be matched up later if needed; length cut off of 500bp; the prefix argument doesn't take hyphens
-		echo Reformating ${project} fasta file and removing contigs \< 1000bp
-		anvi-script-reformat-fasta --seq-type NT --simplify-names --prefix ${prefix} -r ./03_Anvio/${project}/${project}_rename_key.txt -l 1000 -o ./03_Anvio/${project}/${project}_renamed_c1k.fa ${best_assembly} >> ./03_Anvio/logs/${project}.log 2>&1
-		#Generate a contigs database for each assembly
-		echo making contigs database for ${project} assembly
-		anvi-gen-contigs-database -T ${CPU} -f ./03_Anvio/${project}/${project}_renamed_c1k.fa -n ${project} -o ./03_Anvio/${project}/${project}.db >> ./03_Anvio/logs/${project}.log 2>&1
-		echo running hmm
-		anvi-run-hmms -c ./03_Anvio/${project}/${project}.db -T ${CPU} --also-scan-trnas >> ./03_Anvio/logs/${project}.log 2>&1
-		echo running ncbi COGS
-		anvi-run-ncbi-cogs -c ./03_Anvio/${project}/${project}.db -T ${CPU} --sensitive >> ./03_Anvio/logs/${project}.log 2>&1
-		echo running KEGG
-		anvi-run-kegg-kofams -c ./03_Anvio/${project}/${project}.db -T ${CPU} >> ./03_Anvio/logs/${project}.log 2>&1
-		echo running pfams
-		anvi-run-pfams -c ./03_Anvio/${project}/${project}.db -T ${CPU} >> ./03_Anvio/logs/${project}.log 2>&1
-		echo exporting functions to contigs database
-		anvi-export-functions -c ./03_Anvio/${project}/${project}.db -o ./03_Anvio/${project}/COG/${project}_COG20Category.txt --annotation-sources COG20_CATEGORY >> ./03_Anvio/logs/${project}.log 2>&1
-		anvi-export-functions -c ./03_Anvio/${project}/${project}.db -o ./03_Anvio/${project}/COG/${project}_COG20Function.txt --annotation-sources COG20_FUNCTION >> ./03_Anvio/logs/${project}.log 2>&1
-		anvi-export-functions -c ./03_Anvio/${project}/${project}.db -o ./03_Anvio/${project}/KOfam/${project}_KOfam.txt --annotation-sources KOfam >> ./03_Anvio/logs/${project}.log 2>&1
-		anvi-export-functions -c ./03_Anvio/${project}/${project}.db -o ./03_Anvio/${project}/Pfam/${project}_Pfam.txt --annotation-sources Pfam >> ./03_Anvio/logs/${project}.log 2>&1
-		echo exporing gene calls to contig database
-		anvi-export-gene-calls -c ./03_Anvio/${project}/${project}.db --gene-caller prodigal -o ./03_Anvio/${project}/prodigal/${project}_AllGeneCalls.txt >> ./03_Anvio/logs/${project}.log 2>&1
-		echo running SCG taxonomy
-		anvi-run-scg-taxonomy -c ./03_Anvio/${project}/${project}.db -T ${CPU} >> ./03_Anvio/logs/${project}.log 2>&1
-
-		#Remove zeros from column 5 e-value and only write significant hits
-		echo creating functional profile on ${project}
-		awk -F "\t" '{ if(($5 != 0) && ($5 <= .0001)) { print } }' ./03_Anvio/${project}/KOfam/${project}_KOfam.txt >> ./03_Anvio/${project}/KOfam/${project}_KOfamEvalue_filtered.txt
-		awk -F "\t" '{ if(($5 != 0) && ($5 <= .0001)) { print } }' ./03_Anvio/${project}/Pfam/${project}_Pfam.txt >> ./03_Anvio/${project}/Pfam/${project}_PfamEvalue_filtered.txt
-		awk -F "\t" '{ if(($5 != 0) && ($5 <= .0001)) { print } }' ./03_Anvio/${project}/COG/${project}_COG20Function.txt >> ./03_Anvio/${project}/COG/${project}_COG20FunctionEvalue_filtered.txt
-		awk -F "\t" '{ if(($5 != 0) && ($5 <= .0001)) { print } }' ./03_Anvio/${project}/COG/${project}_COG20Category.txt >> ./03_Anvio/${project}/COG/${project}_COG20CategoryEvalue_filtered.txt
-		#Cut unique values from column 4 function, sort them and then count them
-		cut -f4 ./03_Anvio/${project}/KOfam/${project}_KOfamEvalue_filtered.txt | sort | uniq -c >> ./03_Anvio/${project}/KOfam/${project}_KOfamCounts.txt
-		cut -f4 ./03_Anvio/${project}/Pfam/${project}_PfamEvalue_filtered.txt | sort | uniq -c >> ./03_Anvio/${project}/Pfam/${project}_PfamCounts.txt
-		cut -f4 ./03_Anvio/${project}/COG/${project}_COG20FunctionEvalue_filtered.txt | sort | uniq -c >> ./03_Anvio/${project}/COG/${project}_COG20Function.txt
-		cut -f4 ./03_Anvio/${project}/COG/${project}_COG20CategoryEvalue_filtered.txt | sort | uniq -c >> ./03_Anvio/${project}/COG/${project}_COG20Category.txt
+		mkdir -p ./03_Anvio/logs
+		if test -f "./03_Anvio/${project}/${project}_renamed_c1k.fa"; then :
+		else
+			#Simplify headers so as to not cause future anger; make a key so that contigs can be matched up later if needed; length cutoff of 1000bp; the prefix argument doesn't take hyphens
+			echo Reformatting ${project} fasta file and removing contigs \< 1000bp
+			anvi-script-reformat-fasta --seq-type NT --simplify-names --prefix ${prefix} -r ./03_Anvio/${project}/${project}_rename_key.txt -l 1000 -o ./03_Anvio/${project}/${project}_renamed_c1k.fa ${best_assembly} >> ./03_Anvio/logs/${project}.log 2>&1
+			#Generate a contigs database for each assembly
+			echo making contigs database for ${project} assembly
+			anvi-gen-contigs-database -T ${CPU} -f ./03_Anvio/${project}/${project}_renamed_c1k.fa -n ${project} -o ./03_Anvio/${project}/${project}.db >> ./03_Anvio/logs/${project}.log 2>&1
+			echo running hmm
+			anvi-run-hmms -c ./03_Anvio/${project}/${project}.db -T ${CPU} --also-scan-trnas >> ./03_Anvio/logs/${project}.log 2>&1
+			echo running ncbi COGS
+			anvi-run-ncbi-cogs -c ./03_Anvio/${project}/${project}.db -T ${CPU} --sensitive >> ./03_Anvio/logs/${project}.log 2>&1
+			echo running KEGG
+			anvi-run-kegg-kofams -c ./03_Anvio/${project}/${project}.db -T ${CPU} >> ./03_Anvio/logs/${project}.log 2>&1
+			echo running pfams
+			anvi-run-pfams -c ./03_Anvio/${project}/${project}.db -T ${CPU} >> ./03_Anvio/logs/${project}.log 2>&1
+			echo exporting functions from contigs database
+			anvi-export-functions -c ./03_Anvio/${project}/${project}.db -o ./03_Anvio/${project}/COG/${project}_COG20Category.txt --annotation-sources COG20_CATEGORY >> ./03_Anvio/logs/${project}.log 2>&1
+			anvi-export-functions -c ./03_Anvio/${project}/${project}.db -o ./03_Anvio/${project}/COG/${project}_COG20Function.txt --annotation-sources COG20_FUNCTION >> ./03_Anvio/logs/${project}.log 2>&1
+			anvi-export-functions -c ./03_Anvio/${project}/${project}.db -o ./03_Anvio/${project}/KOfam/${project}_KOfam.txt --annotation-sources KOfam >> ./03_Anvio/logs/${project}.log 2>&1
+			anvi-export-functions -c ./03_Anvio/${project}/${project}.db -o ./03_Anvio/${project}/Pfam/${project}_Pfam.txt --annotation-sources Pfam >> ./03_Anvio/logs/${project}.log 2>&1
+			echo exporting gene calls from contigs database
+			anvi-export-gene-calls -c ./03_Anvio/${project}/${project}.db --gene-caller prodigal -o ./03_Anvio/${project}/prodigal/${project}_AllGeneCalls.txt >> ./03_Anvio/logs/${project}.log 2>&1
+			echo running SCG taxonomy
+			anvi-run-scg-taxonomy -c ./03_Anvio/${project}/${project}.db -T ${CPU} --all-hits-output-file ./03_Anvio/${project}/${project}_hits.txt >> ./03_Anvio/logs/${project}.log 2>&1
+			anvi-estimate-scg-taxonomy -c ./03_Anvio/${project}/${project}.db --metagenome-mode --per-scg-output-file ./03_Anvio/${project}/${project}-scg.txt
+
+			#Remove zeros from column 5 e-value and only write significant hits
+			echo creating functional profile on ${project}
+			awk -F "\t" '{ if(($5 != 0) && ($5 <= .0001)) { print } }' ./03_Anvio/${project}/KOfam/${project}_KOfam.txt >> ./03_Anvio/${project}/KOfam/${project}_KOfamEvalue_filtered.txt
+			awk -F "\t" '{ if(($5 != 0) && ($5 <= .0001)) { print } }' ./03_Anvio/${project}/Pfam/${project}_Pfam.txt >> ./03_Anvio/${project}/Pfam/${project}_PfamEvalue_filtered.txt
+			awk -F "\t" '{ if(($5 != 0) && ($5 <= .0001)) { print } }' ./03_Anvio/${project}/COG/${project}_COG20Function.txt >> ./03_Anvio/${project}/COG/${project}_COG20FunctionEvalue_filtered.txt
+			awk -F "\t" '{ if(($5 != 0) && ($5 <= .0001)) { print } }' ./03_Anvio/${project}/COG/${project}_COG20Category.txt >> ./03_Anvio/${project}/COG/${project}_COG20CategoryEvalue_filtered.txt
+			#Cut unique values from column 4 function, sort them and then count them
+			cut -f4 ./03_Anvio/${project}/KOfam/${project}_KOfamEvalue_filtered.txt | sort | uniq -c >> ./03_Anvio/${project}/KOfam/${project}_KOfamCounts.txt
+			cut -f4 ./03_Anvio/${project}/Pfam/${project}_PfamEvalue_filtered.txt | sort | uniq -c >> ./03_Anvio/${project}/Pfam/${project}_PfamCounts.txt
+			cut -f4 ./03_Anvio/${project}/COG/${project}_COG20FunctionEvalue_filtered.txt | sort | uniq -c >> ./03_Anvio/${project}/COG/${project}_COG20FunctionCounts.txt
+			cut -f4 ./03_Anvio/${project}/COG/${project}_COG20CategoryEvalue_filtered.txt | sort | uniq -c >> ./03_Anvio/${project}/COG/${project}_COG20CategoryCounts.txt
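+			# A quick sanity check, as a sketch: show the ten most common KOfam
+			# annotations (uniq -c above writes the count first, so a numeric
+			# reverse sort ranks them).
+			sort -nr ./03_Anvio/${project}/KOfam/${project}_KOfamCounts.txt | head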
+		fi
+
 	# Anvi'o is only efficient when analyzing < 20,000 splits. So choose a bp length cutoff so that only <20,000 splits are included
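+	# A sketch to narrow the choice before answering the prompt below: count the
+	# contigs above a few candidate cutoffs (uses the same contigs_basic_info
+	# table named in the prompt).
+	for len in 1000 2500 5000 10000; do
+		echo "contigs > ${len} bp: $(sqlite3 ${project_dir}/03_Anvio/${project}/${project}.db "select count(*) from contigs_basic_info where length > ${len};")"
+	done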
 	read -r -p "Run: sqlite3 ${project_dir}/03_Anvio/${project}/${project}.db 'select count(*) from contigs_basic_info where length > ####;' and enter the length cutoff that results in <20,000 splits. Numbers only: " len_cutoff || exit 100
 	for sample in `cat samples.txt`; do
-		echo creating bam file for ${sample} HQ reads against the ${project} best assembly.
-		prefix2=$(echo $sample | sed 's/-/_/g')
-		mkdir -p ./04_Mapping/${project}
-		bbmap.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz out=./04_Mapping/${project}/${sample}.sam bamscript=bs.sh outm=./04_Mapping/${project}/${project}_mapped.fq.gz outu=./04_Mapping/${project}/${project}_unmapped.fq.gz ihist=./04_Mapping/${project}/${project}_insertsize.txt interleaved=t pigz=t threads=12 -Xmx${RAM}g >> ./04_Mapping/${project}/${Project}.log 2>&1
-		sh bs.sh >> ./04_Mapping/${project}/${Project}.log 2>&1
-		rm bs.sh
-		echo creating anvi profile for ${sample} sample in project ${project}
-		anvi-profile -i ./04_Mapping/${project}/${sample}_sorted.bam -c ./03_Anvio/${project}/${project}.db -T ${CPU} -M ${len_cutoff} --sample-name ${prefix2} --output-dir ./03_Anvio/${project}/profiles_c${len_cutoff} >> ./03_Anvio/logs/${project}.log 2>&1
+		if test -f "./03_Anvio/${project}/profiles_c${len_cutoff}/${sample}/PROFILE.db"; then :
+		else
+			echo creating bam file for ${sample} HQ reads against the ${project} best assembly.
+			prefix2=$(echo $sample | sed 's/-/_/g')
+			mkdir -p ./04_Mapping/${project}
+			bbmap.sh in=./01_HQ_Reads/${sample}_HQ.fq.gz ref=./03_Anvio/${project}/${project}_renamed_c1k.fa out=./04_Mapping/${project}/${sample}.sam bamscript=bs.sh outm=./04_Mapping/${project}/${sample}_mapped.fq.gz outu=./04_Mapping/${project}/${sample}_unmapped.fq.gz ihist=./04_Mapping/${project}/${sample}_insertsize.txt interleaved=t pigz=t threads=${CPU} -Xmx${RAM}g ow >> ./04_Mapping/${project}/${project}.log 2>&1
			sh bs.sh >> ./04_Mapping/${project}/${project}.log 2>&1
+			rm bs.sh
+			echo creating anvi\'o profile for ${sample} sample in project ${project}
+			anvi-profile -i ./04_Mapping/${project}/${sample}_sorted.bam -c ./03_Anvio/${project}/${project}.db -T ${CPU} -M ${len_cutoff} --sample-name ${prefix2} --output-dir ./03_Anvio/${project}/profiles_c${len_cutoff}/${sample} >> ./03_Anvio/logs/${project}.log 2>&1
+		fi
 	done
-	echo merging project profiles
-	anvi-merge --enforce-hierarchical-clustering ./03_Anvio/${project}/profiles_c${len_cutoff}/*/PROFILE.db -o ./03_Anvio/${project}/profiles_c${len_cutoff}/merged -c ./03_Anvio/${project}/${project}.db >> ./03_Anvio/logs/${project}.log 2>&1
+	if test -f "./03_Anvio/${project}/profiles_c${len_cutoff}/merged/PROFILE.db"; then :
+	else
+		echo merging ${project} profiles
+		anvi-merge --enforce-hierarchical-clustering ./03_Anvio/${project}/profiles_c${len_cutoff}/*/PROFILE.db -o ./03_Anvio/${project}/profiles_c${len_cutoff}/merged -c ./03_Anvio/${project}/${project}.db >> ./03_Anvio/logs/${project}.log 2>&1
+	fi

-	read -r -p "Which binner would you like to use to create Metagenomic Assembled Genomes? concoct metabat2 maxbin2 dastool binsanity " binner | exit 100
+	read -r -p "Which binner would you like to use to create Metagenomic Assembled Genomes? concoct metabat2 maxbin2 dastool binsanity : " binner || exit 100
+	echo Creating MAGs using ${binner}
 	anvi-cluster-contigs -c ./03_Anvio/${project}/${project}.db -p ./03_Anvio/${project}/profiles_c${len_cutoff}/merged/PROFILE.db -C ${binner} --driver ${binner} -T ${CPU} --log-file ./03_Anvio/${project}/${project}.log --just-do-it >> ./03_Anvio/logs/${project}.log 2>&1
-	echo Create an anvi\'o summary
-
-	anvi-summarize -p ./03_Anvio/${project}/profiles_c${len_cutoff}/merged/PROFILE.db -c ./03_Anvio/${project}/${project}.db -C ${binner} -o ./03_Anvio/${project}
+	anvi-summarize -p ./03_Anvio/${project}/profiles_c${len_cutoff}/merged/PROFILE.db -c ./03_Anvio/${project}/${project}.db -C ${binner} -o ./03_Anvio/${project}/SUMMARY_${binner}
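+	# Optional, as a sketch: once binning has run, the merged profile can be
+	# browsed interactively from the same anvi'o environment (needs a browser).
+	#anvi-interactive -p ./03_Anvio/${project}/profiles_c${len_cutoff}/merged/PROFILE.db -c ./03_Anvio/${project}/${project}.db -C ${binner}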
" min_BGC || exit 100 mkdir -p ./04_Annotations/antismash/${project}_c${min_BGC} echo Running antismash to annotate ${project} secondary metabolites @@ -1004,32 +1179,22 @@ if [[ "${coassemble}" == yes ]]; then --genefinding-tool prodigal-m \ --cc-mibig \ --minlength ${min_BGC} \ - -c ${CPU} >> ./${project}.log 2>&1 - - mkdir -p ./04_Annotations/bigscape/${project}_c${min_BGC} + -c ${CPU} >> ./${project}.log 2>&1 || "No BGCs found in ${project}" && exit 100 - cd ./04_Annotations/antismash/${project}_c${min_BGC}/${project}_renamed_c1k + cd ${project_dir} + fi + #read -r -p "Would you like to analyze annotated biosynthetic gene clusters for ${project} against the MiBIG database? yes or no: " bigscape2 || exit 100 + if [[ "${bigscape2}" == yes ]]; then echo Running bigscape on ${project} antismash genbank files - - if test -f "c*.gbk"; then - for genbank_file in $(ls c*.gbk); do - cluster=$(echo ${genbank_file} | cut -f1 -d '_') - echo copying and renaming ${genbank_file} to cluster_${cluster}_${project}.gbk - cp ${genbank_file} ${project_dir}/04_Annotations/bigscape/${project}_c${min_BGC}/cluster_${cluster}_${project}.gbk - done - else echo No BGC found in ${project} - fi - - - cd ${project_dir}/04_Annotations/bigscape - - ~/bin/run_bigscape.py ${project}_c${min_BGC} ${project}_c${min_BGC}_results --mix --mibig -c ${CPU} >> ./${project}.log 2&>1 - - cd ${project_dir} + mkdir -p ./04_Annotations/bigscape + ~/bin/run_bigscape ./04_Annotations/antismash ./04_Annotations/bigscape --mix --mibig -c ${CPU} >> ./${project}.log 2&>1 + elif [[ "${bigscape2}" == no ]]; then : fi -# Or else exit + + cd ${project_dir} elif [[ "${coassemble}" == no ]]; then echo Co-Assemblies have not been generated nor analyzed for ${project} fi + echo finished meow diff --git a/taxatarget_environment.yaml b/taxatarget_environment.yaml new file mode 100644 index 0000000..16f3a2b --- /dev/null +++ b/taxatarget_environment.yaml @@ -0,0 +1,46 @@ +name: taxatarget +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - bzip2=1.0.8=h7f98852_4 + - ca-certificates=2021.10.8=ha878542_0 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - libblas=3.9.0=13_linux64_openblas + - libcblas=3.9.0=13_linux64_openblas + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=11.2.0=h1d223b6_14 + - libgfortran-ng=11.2.0=h69a702a_14 + - libgfortran5=11.2.0=h5c6108e_14 + - libgomp=11.2.0=h1d223b6_14 + - liblapack=3.9.0=13_linux64_openblas + - libnsl=2.0.0=h7f98852_0 + - libopenblas=0.3.18=pthreads_h8fe5266_0 + - libstdcxx-ng=11.2.0=he4da1e4_14 + - libuuid=2.32.1=h7f98852_1000 + - libzlib=1.2.11=h166bdaf_1014 + - ncurses=6.3=h9c3ff4c_0 + - numpy=1.22.3=py310h45f3432_0 + - openssl=3.0.2=h166bdaf_1 + - pandas=1.4.1=py310hb5077e9_0 + - pip=22.0.4=pyhd8ed1ab_0 + - python=3.10.4=h2660328_0_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python_abi=3.10=2_cp310 + - pytz=2022.1=pyhd8ed1ab_0 + - readline=8.1=h46c0cb4_0 + - scipy=1.8.0=py310hea5193d_1 + - setuptools=61.2.0=py310hff52083_0 + - six=1.16.0=pyh6c4a22f_0 + - sqlite=3.37.1=h4ff8645_0 + - tk=8.6.12=h27826a3_0 + - tzdata=2022a=h191b570_0 + - wheel=0.37.1=pyhd8ed1ab_0 + - xz=5.2.5=h516909a_1 + - zlib=1.2.11=h166bdaf_1014 + - pip: + - biopython==1.79 +prefix: /home/fch/miniconda3/envs/taxatarget