From e90b38fe88aeb04f9c53d487dac7c23ec7328ba4 Mon Sep 17 00:00:00 2001 From: GuilleGorines Date: Wed, 27 Sep 2023 17:45:45 +0200 Subject: [PATCH] Added notes to BLAST --- .../blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog | 38 +++++++++++++++++-- .../genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog | 22 ++++++++++- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog b/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog index a6974298..0c948449 100644 --- a/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog +++ b/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog @@ -25,7 +25,37 @@ done # NOTE3: change the -query flag to meet your requirements cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 376530M --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db /data/bi/references/BLAST_dbs/nt_20211025/nt -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle std slen qlen qcovs' &" > _01_blast.sh -cat ../samples_id.txt | xargs -I %% echo "awk 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print \$0,\$6/\$16,\$6/\$15}' %%/%%_blast.tsv | awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$15 > 200 && \$17 > 0.7 && \$1 !~ /phage/ {print samplename,\$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh -echo -e "echo \"samplename\tcontigname\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tslen\tqlen\tqcovs\t%cgAligned\t%refCovered\" > header" > _03_gather_results.sh -echo "cat header */*blast_filt.tab > all_samples_filtered_BLAST_results.tab" >> _03_gather_results.sh -echo "rm header" >> _03_gather_results.sh + +# Filtering criteria: + # %refCovered > 0.7 + # ref not a phage (stitle !~ 
/phage/) + # ref longer than 200 bp (slen > 200) + +cat ../samples_id.txt | xargs -I %% echo "awk 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print \$0,\$16/\$6,\$15/\$6}' %%/%%_blast.tsv | awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print samplename,\$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh +echo -e "echo \"samplename\tcontigname\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tslen\tqlen\tqcovs\t%cgAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh +echo "cat header */*blast_filt.tsv > all_samples_filtered_BLAST_results.tsv" >> _03_gather_results_add_header.sh +cat ../samples_id.txt | xargs -I %% echo "cat header %%/%%_blast_filt.tsv > tmp; rm %%/%%_blast_filt.tsv; mv tmp %%/%%_blast_filt.tsv" >> _03_gather_results_add_header.sh +echo "rm header" >> _03_gather_results_add_header.sh + +# NOTES FOR FILTERING +# 1: samplename +# 2: contigname +# 3: stitle +# 4: qaccver +# 5: saccver +# 6: pident +# 7: length +# 8: mismatch +# 9: gapopen +# 10: qstart +# 11: qend +# 12: sstart +# 13: send +# 14: evalue +# 15: bitscore +# 16: slen +# 17: qlen +# 18: qcovs +# 19: %cgAligned +# 20: %refCovered +# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6 \ No newline at end of file diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog index 6988e2f7..d3a06b08 100644 --- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog +++ b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog @@ -23,4 +23,24 @@ cat ../samples_id.txt | while read in; do echo "awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} echo "echo -e 'stitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tslen\tqlen\tqcovs\t%cgAligned\t%refCovered' > header" > _03_add_header.sh cat ../samples_id.txt | while read in; do echo "cat header 
${in}/${in}.blast.filt.txt > ${in}.blast.filt.header.txt"; done >> _03_add_header.sh -echo "rm header" >> _03_add_header.sh \ No newline at end of file +echo "rm header" >> _03_add_header.sh + +# 1: stitle +# 2: qaccver +# 3: saccver +# 4: pident +# 5: length +# 6: mismatch +# 7: gapopen +# 8: qstart +# 9: qend +# 10: sstart +# 11: send +# 12: evalue +# 13: bitscore +# 14: slen +# 15: qlen +# 16: qcovs +# 17: %cgAligned +# 18: %refCovered +# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6 \ No newline at end of file