Skip to content

Commit

Permalink
updated BLAST (both are identical. ik, but its temporary)
Browse files Browse the repository at this point in the history
  • Loading branch information
GuilleGorines committed Sep 28, 2023
1 parent 97405f3 commit ed7ea2a
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 42 deletions.
38 changes: 26 additions & 12 deletions bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ mkdir logs

# Location of assemblies to a variable so it only has to be changed here
LOCATION=../*/*/assembly/*/*
# Other databases:
# /data/bi/references/BLAST_dbs/nt_20211025/nt
BLAST_DATABASE="/data/bi/references/virus/BLAST/all_virus.fasta"

# if there are scaffolds, uncompress the scaffolds in its dir (zcat for decompression)
# if there contigs and no scaffolds, uncompress the contigs as scaffolds in its dir
Expand All @@ -24,38 +27,49 @@ cat ../samples_id.txt | while read in; do
done

# NOTE3: change the -query flag to meet your requirements
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 376530M --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db /data/bi/references/BLAST_dbs/nt_20211025/nt -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle std slen qlen qcovs' &" > _01_blast.sh
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db ${BLAST_DATABASE} -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh

# Filtering criteria:
# %refCovered > 0.7
# ref not a phage (stitle ~! /phage/)
# ref longer than 200 bp (slen > 200)

cat ../samples_id.txt | xargs -I %% echo "awk 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print \$0,\$16/\$6,\$15/\$6}' %%/%%_blast.tsv | awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print samplename,\$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh
echo -e "echo \"samplename\tcontigname\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tslen\tqlen\tqcovs\t%cgAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh
# First awk: create the full table; second awk: filter it
cat ../samples_id.txt | xargs -I %% echo "awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print samplename,\$0,(\$6-\$8)/\$16,\$6/\$15}' %%/%%_blast.tsv | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print \$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh
echo -e "echo \"samplename\tqseqid\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgap\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tref_len\tquery_len\tqcovs\t%queryAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh
echo "cat header */*blast_filt.tsv > all_samples_filtered_BLAST_results.tsv" >> _03_gather_results_add_header.sh
cat ../samples_id.txt | xargs -I %% echo "cat header %%/%%_blast_filt.tsv > tmp; rm %%/%%_blast_filt.tsv; mv tmp %%/%%_blast_filt.tsv" >> _03_gather_results_add_header.sh
echo "rm header" >> _03_gather_results_add_header.sh

# NOTES FOR FILTERING
#
# subject = reference
#
# COLS GENERATED BY US:
# 1: samplename
# 2: contigname
# 3: stitle
# GENERATED BY BLAST
# 2: contigname - qseqid
# 3: stitle
# 4: qaccver
# 5: saccver
# 6: pident
# 7: length
# 7: length (of alignment)
# 8: mismatch
# 9: gapopen
# 9: gaps
# 10: qstart
# 11: qend
# 12: sstart
# 13: send
# 14: evalue
# 15: bitscore
# 16: slen
# 17: qlen
# 16: ref len - slen
# 17: query len - qlen
# 18: qcovs
# 19: %cgAligned
# 20: %refCovered
# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
# GENERATED BY US:
# 19: %queryAligned: (length-gaps)/qlen (if gaps are not deleted, then this would be bigger than 1 sometimes)
# 20: %refCovered: length/slen

# conda activate 2excel
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename %%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh
echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh
89 changes: 59 additions & 30 deletions bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,73 @@
scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
mkdir logs

# Location of assemblies to a variable so it only has to be changed here
LOCATION=../*/*/assembly/*/*
# Other databases:
# /data/bi/references/BLAST_dbs/nt_20211025/nt
BLAST_DATABASE="/data/bi/references/virus/BLAST/all_virus.fasta"

# if there are scaffolds, uncompress the scaffolds in its dir (zcat for decompression)
# if there contigs and no scaffolds, uncompress the contigs as scaffolds in its dir
echo "Samples that did not generate scaffolds:" > noscaffold.txt
cat ../samples_id.txt | while read in; do
mkdir ${in}
# ls will return 0 if there are no scaffolds file
if [ $(ls ../*/*/assembly/*/*/${in}.scaffolds.fa.gz | wc -l) != 0 ]; then
zcat ../*/*/assembly/*/*/${in}.scaffolds.fa.gz > ${in}/${in}.scaffolds.fa
# NOTE: change extension and location at will
# NOTE2: zcat is only used in case of gzipped files, use a cp or ln -s if needed
if [ $(ls ${LOCATION}/${in}.scaffolds.fa.gz | wc -l) != 0 ]; then
zcat ${LOCATION}/${in}.scaffolds.fa.gz > ${in}/${in}.scaffolds.fa
else
zcat ../*/*/assembly/*/*/${in}.contigs.fa.gz > ${in}/${in}.scaffolds.fa
# Note assemblies that did not make a scaffold
zcat ${LOCATION}/${in}.contigs.fa.gz > ${in}/${in}.scaffolds.fa
echo ${in} >> noscaffold.txt
fi
done

cat ../samples_id.txt | while read in; do echo "srun --chdir ${scratch_dir} --partition middle_idx --cpus-per-task 10 --output logs/BLASTN_${in}_%j.log --job-name BLASTN_${in} blastn -num_threads 10 -db /data/bi/references/virus/BLAST/all_virus.fasta -query ${in}/${in}.scaffolds.fa -out ${in}/${in}.blast.txt -outfmt '6 stitle std slen qlen qcovs' &"; done > _01_blast.sh
cat ../samples_id.txt | while read in; do echo "awk 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print \$0,\$5/\$15,\$5/\$14}' ${in}/${in}.blast.txt | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$15 > 200 && \$1 !~ /phage/ {print \$0}' > ${in}/${in}.blast.filt.txt"; done > _02_filt_blast.sh

echo "echo -e 'stitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tslen\tqlen\tqcovs\t%cgAligned\t%refCovered' > header" > _03_add_header.sh

cat ../samples_id.txt | while read in; do echo "cat header ${in}/${in}.blast.filt.txt > ${in}.blast.filt.header.txt"; done >> _03_add_header.sh
echo "rm header" >> _03_add_header.sh

# 1: stitle
# 2: qaccver
# 3: saccver
# 4: pident
# 5: length
# 6: ismatch
# 7: gapopen
# 8: qstart
# 9: qend
# 10: sstart
# 11: send
# 12: evalue
# 13: bitscore
# 14: slen
# 15: qlen
# 16: qcovs
# 17: %cgAligned
# 18: %refCovered
# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
# NOTE3: change the -query flag to meet your requirements
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db ${BLAST_DATABASE} -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh

# Filtering criteria:
# %refCovered > 0.7
# ref not a phage (stitle ~! /phage/)
# ref longer than 200 bp (slen > 200)

# First awk: create the full table; second awk: filter it
cat ../samples_id.txt | xargs -I %% echo "awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print samplename,\$0,(\$6-\$8)/\$16,\$6/\$15}' %%/%%_blast.tsv | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print \$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh
echo -e "echo \"samplename\tqseqid\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgap\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tref_len\tquery_len\tqcovs\t%queryAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh
echo "cat header */*blast_filt.tsv > all_samples_filtered_BLAST_results.tsv" >> _03_gather_results_add_header.sh
cat ../samples_id.txt | xargs -I %% echo "cat header %%/%%_blast_filt.tsv > tmp; rm %%/%%_blast_filt.tsv; mv tmp %%/%%_blast_filt.tsv" >> _03_gather_results_add_header.sh
echo "rm header" >> _03_gather_results_add_header.sh

# NOTES FOR FILTERING
#
# subject = reference
#
# COLS GENERATED BY US:
# 1: samplename
# GENERATED BY BLAST
# 2: contigname - qseqid
# 3: stitle
# 4: qaccver
# 5: saccver
# 6: pident
# 7: length (of alignment)
# 8: mismatch
# 9: gaps
# 10: qstart
# 11: qend
# 12: sstart
# 13: send
# 14: evalue
# 15: bitscore
# 16: ref len - slen
# 17: query len - qlen
# 18: qcovs
# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
# GENERATED BY US:
# 19: %queryAligned: (length-gaps)/qlen (if gaps are not deleted, then this would be bigger than 1 sometimes)
# 20: %refCovered: length/slen

# conda activate 2excel
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename %%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh
echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh

0 comments on commit ed7ea2a

Please sign in to comment.