diff --git a/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog b/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog index 0c948449..c9bd56e6 100644 --- a/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog +++ b/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog @@ -5,6 +5,9 @@ mkdir logs # Location of assemblies to a variable so it only has to be changed here LOCATION=../*/*/assembly/*/* +# Other databases: +# /data/bi/references/BLAST_dbs/nt_20211025/nt +BLAST_DATABASE="/data/bi/references/virus/BLAST/all_virus.fasta" # if there are scaffolds, uncompress the scaffolds in its dir (zcat for decompression) # if there contigs and no scaffolds, uncompress the contigs as scaffolds in its dir @@ -24,38 +27,49 @@ cat ../samples_id.txt | while read in; do done # NOTE3: change the -query flag to meet your requirements -cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 376530M --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db /data/bi/references/BLAST_dbs/nt_20211025/nt -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle std slen qlen qcovs' &" > _01_blast.sh +cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db ${BLAST_DATABASE} -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh # Filtering criteria: # %refCovered > 0.7 # ref not a phage (stitle ~! /phage/) # ref longer than 200 bp (slen > 200) -cat ../samples_id.txt | xargs -I %% echo "awk 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print \$0,\$16/\$6,\$15/\$6}' %%/%%_blast.tsv | awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print samplename,\$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh -echo -e "echo \"samplename\tcontigname\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tslen\tqlen\tqcovs\t%cgAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh +# First awk: create the full table; second awk: filter it +cat ../samples_id.txt | xargs -I %% echo "awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print samplename,\$0,(\$6-\$8)/\$16,\$6/\$15}' %%/%%_blast.tsv | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print \$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh +echo -e "echo \"samplename\tqseqid\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgap\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tref_len\tquery_len\tqcovs\t%queryAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh echo "cat header */*blast_filt.tsv > all_samples_filtered_BLAST_results.tsv" >> _03_gather_results_add_header.sh cat ../samples_id.txt | xargs -I %% echo "cat header %%/%%_blast_filt.tsv > tmp; rm %%/%%_blast_filt.tsv; mv tmp %%/%%_blast_filt.tsv" >> _03_gather_results_add_header.sh echo "rm header" >> _03_gather_results_add_header.sh # NOTES FOR FILTERING +# +# subject = reference +# +# COLS GENERATED BY US: # 1: samplename -# 2: contigname -# 3: stitle +# GENERATED BY BLAST +# 2: contigname - qseqid +# 3: stitle # 4: qaccver # 5: saccver # 6: pident -# 7: length +# 7: length (of alignment) # 8: mismatch -# 9: gapopen +# 9: gaps # 10: qstart # 11: qend # 12: sstart # 13: send # 14: evalue # 15: bitscore -# 16: slen -# 17: qlen +# 16: ref len - slen +# 17: query len - qlen # 18: qcovs -# 19: %cgAligned -# 20: %refCovered -# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6 \ No newline at end of file +# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6 +# GENERATED BY US: +# 19: %queryAligned: (length-gaps)/qlen (if gaps are not deleted, then this would be bigger than 1 sometimes) +# 20: %refCovered: length/slen + +# conda activate 2excel +cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename %%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh +echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh \ No newline at end of file diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog index d3a06b08..7910e2cc 100644 --- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog +++ b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog @@ -3,44 +3,73 @@ scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") mkdir logs +# Location of assemblies to a variable so it only has to be changed here +LOCATION=../*/*/assembly/*/* +# Other databases: +# /data/bi/references/BLAST_dbs/nt_20211025/nt +BLAST_DATABASE="/data/bi/references/virus/BLAST/all_virus.fasta" + # if there are scaffolds, uncompress the scaffolds in its dir (zcat for decompression) # if there contigs and no scaffolds, uncompress the contigs as scaffolds in its dir echo "Samples that did not generate scaffolds:" > noscaffold.txt cat ../samples_id.txt | while read in; do mkdir ${in} # ls will return 0 if there are no scaffolds file - if [ $(ls ../*/*/assembly/*/*/${in}.scaffolds.fa.gz | wc -l) != 0 ]; then - zcat ../*/*/assembly/*/*/${in}.scaffolds.fa.gz > ${in}/${in}.scaffolds.fa + # NOTE: change extension and location at will + # NOTE2: zcat is only used in case of gzipped files, use a cp or ln -s if needed + if [ $(ls ${LOCATION}/${in}.scaffolds.fa.gz | wc -l) != 0 ]; then + zcat ${LOCATION}/${in}.scaffolds.fa.gz > ${in}/${in}.scaffolds.fa else - zcat ../*/*/assembly/*/*/${in}.contigs.fa.gz > ${in}/${in}.scaffolds.fa + # Note assemblies that did not make a scaffold + zcat ${LOCATION}/${in}.contigs.fa.gz > ${in}/${in}.scaffolds.fa echo ${in} >> noscaffold.txt fi done -cat ../samples_id.txt | while read in; do echo "srun --chdir ${scratch_dir} --partition middle_idx --cpus-per-task 10 --output logs/BLASTN_${in}_%j.log --job-name BLASTN_${in} blastn -num_threads 10 -db /data/bi/references/virus/BLAST/all_virus.fasta -query ${in}/${in}.scaffolds.fa -out ${in}/${in}.blast.txt -outfmt '6 stitle std slen qlen qcovs' &"; done > _01_blast.sh -cat ../samples_id.txt | while read in; do echo "awk 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print \$0,\$5/\$15,\$5/\$14}' ${in}/${in}.blast.txt | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$15 > 200 && \$1 !~ /phage/ {print \$0}' > ${in}/${in}.blast.filt.txt"; done > _02_filt_blast.sh - -echo "echo -e 'stitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tslen\tqlen\tqcovs\t%cgAligned\t%refCovered' > header" > _03_add_header.sh - -cat ../samples_id.txt | while read in; do echo "cat header ${in}/${in}.blast.filt.txt > ${in}.blast.filt.header.txt"; done >> _03_add_header.sh -echo "rm header" >> _03_add_header.sh - -# 1: stitle -# 2: qaccver -# 3: saccver -# 4: pident -# 5: length -# 6: ismatch -# 7: gapopen -# 8: qstart -# 9: qend -# 10: sstart -# 11: send -# 12: evalue -# 13: bitscore -# 14: slen -# 15: qlen -# 16: qcovs -# 17: %cgAligned -# 18: %refCovered -# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6 \ No newline at end of file +# NOTE3: change the -query flag to meet your requirements +cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db ${BLAST_DATABASE} -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh + +# Filtering criteria: + # %refCovered > 0.7 + # ref not a phage (stitle ~! /phage/) + # ref longer than 200 bp (slen > 200) + +# First awk: create the full table; second awk: filter it +cat ../samples_id.txt | xargs -I %% echo "awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print samplename,\$0,(\$6-\$8)/\$16,\$6/\$15}' %%/%%_blast.tsv | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print \$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh +echo -e "echo \"samplename\tqseqid\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgap\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tref_len\tquery_len\tqcovs\t%queryAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh +echo "cat header */*blast_filt.tsv > all_samples_filtered_BLAST_results.tsv" >> _03_gather_results_add_header.sh +cat ../samples_id.txt | xargs -I %% echo "cat header %%/%%_blast_filt.tsv > tmp; rm %%/%%_blast_filt.tsv; mv tmp %%/%%_blast_filt.tsv" >> _03_gather_results_add_header.sh +echo "rm header" >> _03_gather_results_add_header.sh + +# NOTES FOR FILTERING +# +# subject = reference +# +# COLS GENERATED BY US: +# 1: samplename +# GENERATED BY BLAST +# 2: contigname - qseqid +# 3: stitle +# 4: qaccver +# 5: saccver +# 6: pident +# 7: length (of alignment) +# 8: mismatch +# 9: gaps +# 10: qstart +# 11: qend +# 12: sstart +# 13: send +# 14: evalue +# 15: bitscore +# 16: ref len - slen +# 17: query len - qlen +# 18: qcovs +# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6 +# GENERATED BY US: +# 19: %queryAligned: (length-gaps)/qlen (if gaps are not deleted, then this would be bigger than 1 sometimes) +# 20: %refCovered: length/slen + +# conda activate 2excel +cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename %%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh +echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh