-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGenerateVirusMatrix.sh
74 lines (57 loc) · 3.14 KB
/
GenerateVirusMatrix.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/bin/bash
#SBATCH -N 1
#SBATCH -t 12:00:00
#SBATCH --mem=60G
#SBATCH --mail-type=BEGIN,END
#SBATCH [email protected]
#########################################################################
# SET THESE PARAMETERS (N.B. $TMPDIR variable refers to compute node scratch space, adjust this to your system)
# Activate conda environment
eval "$(conda shell.bash hook)"
conda activate hostPredict
# Data. Assumes paired-end fastq.gz, with extensions "_1.fastq.gz" & "_2.fastq.gz"
cp /directory/*fastq.gz $TMPDIR
# Mapping reference transfer
cp /directory/viruses.fas $TMPDIR
reference_file=$TMPDIR/viruses.fas
# Out directory
outdir=/directory/outputs
# Set number of threads to use
thread_num=8
# Set wd to compute node
cd $TMPDIR
#########################################################################
# Start script
# Index reference
bwa index $reference_file
makeblastdb -in $reference_file -dbtype nucl
# Store sample names and read counts for later normalisation
ls -1 *_1.fastq.gz | sed 's/_1.fastq.gz//' > names
while read line; do count_forward=`echo $(zcat ${line}_1.fastq.gz | wc -l)/4|bc`; count_reverse=`echo $(zcat ${line}_2.fastq.gz | wc -l)/4|bc`; echo $(($count_forward + $count_reverse)) >> counts; done < names
paste names counts > temp_data; rm names counts
echo -e 'sample_name\tread_count' > header
cat header temp_data > sample_info; rm header temp_data
# Map reads to reference
for i in *fastq.gz; do bwa mem -t $thread_num $reference_file $i | samtools view -S -F4 - > ${i}.vir.sam; done
# Reassign multi-mapped reads to best hit in sam file
for i in *sam; do touch updated_$i; done
ls -1 *sam | grep -v updated_ > process_files
while read line; do pathoscope ID -alignFile $line -fileType sam -outDir . ; done < process_files; rm pathoid-sam-report.tsv process_files
for i in updated_*; do mv $i ${i#updated_}; done
# QC of reads
for i in *sam; do reformat.sh in=$i out=${i}.fasta; done
for i in *sam.fasta; do blastn -query $i -db $reference_file -outfmt "6 qseqid sacc pident length qlen mismatch gapopen sstart send evalue bitscore staxid stitle" -num_threads $thread_num -word_size 11 -gapopen 5 -gapextend 2 -penalty -3 -reward 2 -dust yes | sort -k1,1 -k11,11nr -k10,10n | sort -u -k1,1 --merge > ${i}.fmt6; done
for i in *fmt6; do awk '$4 > 40' $i | cut -f1 | sort | uniq > $i.hq; done
for i in *sam; do grep -Ff ${i}.fasta.fmt6.hq $i > ${i%sam}hq.sam; done
# Merge paired-end outputs (done late, since identical forward and reverse read names will otherwise be lost)
for i in *_1.fastq.gz.vir.hq.sam; do cat $i ${i%1.fastq.gz*}2.fastq.gz.vir.hq.sam >> ${i%1.fastq.gz*}merged.fastq.gz.vir.hq.sam; done
# Construct data matrix
grep ">" $reference_file | sed 's/>//' > header_lines
while read line; do for i in *merged*hq.sam; do cut -f1-3 $i | grep $line | wc -l >> $line.col; done; done < header_lines
temp_outs=`sed 's/$/.col/' header_lines | tr '\n' '\t'`
paste $temp_outs > temp_matrix; rm *.col
tr '\n' '\t' < header_lines > header; rm header_lines; echo "" >> header
cat header temp_matrix > named_temp_matrix
paste sample_info named_temp_matrix > virus_matrix.txt; rm sample_info *temp_matrix header
# Clearup
mv *merged*hq.sam virus_matrix.txt $outdir