-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathOR_Finder_Step1.bash
151 lines (79 loc) · 4.78 KB
/
OR_Finder_Step1.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/bin/bash
#This script allows to find every functionnals olfactory receptors in a genome. It has to be run with 2 variables : a genome and a OR database
#One can apply several more filters to resulting files like a 7tm check.
#The folder on which scripts are run should contain the file rename_fasta.pl, R_script_numero1.R, R_script_numero2.R, Classification_outgroups.prot and Classification_OR_multifasta.prot
#tblastn of all or fish against the genome of interest
genome=$1
or_database=$2
Unitprot_database=$3
#Remove spaces in scaffold name and transform every nucleotides in uppercase
sed -i "s/ .*//g" $genome
sed -i "s/a/A/g" $genome
sed -i "s/t/T/g" $genome
sed -i "s/g/G/g" $genome
sed -i "s/c/C/g" $genome
#Prepare the genome database
makeblastdb -in $genome -dbtype nucl
#Perform a tblastn of OR database against the genome of interest
tblastn -query $or_database -db $genome -evalue 1e-10 -outfmt 6 -out OR_vs_Genome.blastn -num_threads 30
#Rscript that permit to extract Best-hits that are non-overlapping regions with tblastn match on it
Rscript R_script_numero1.R
#Prepare a genome .fai file
samtools faidx $genome
#Remove the file Putative_or_multifasta_raw.fa if a run has already be done
rm Putative_or_multifasta_raw.fa
#For each best-hits we will extract ORFs that are longer than 750 bp
IFS=$'\n'
for line in `cat Best_hits_filtered.tsv` ; do
echo "$line" > test
scaffold=`cut -f1 test`
initial_start=`cut -f2 test`
initial_stop=`cut -f3 test`
new_3prime=$((initial_start - 1000)) #extend the best-hit 1000bp upstream an downtream to make sure we get the ATG and TAG
new_5prime=$((initial_stop + 1000))
if [ "$new_3prime" -lt '1' ] ; then #If the extension go below 1 then we put it to 1 (scaffold beginning)
new_3prime=1
fi
samtools faidx $genome $scaffold:$new_3prime-$new_5prime > sequence.fa
getorf -sequence sequence.fa -outseq orf_list -minsize 750 -find 3
sed -i "s/>/>$scaffold:/g" orf_list ; sed -i 's/ //g' orf_list
cat orf_list >> Putative_or_multifasta_raw.fa
done
#Rename fasta headers for it to be scaffold:start-stop
sed -i 's/(REVERSESENSE)/_reverse/g' Putative_or_multifasta_raw.fa
grep ">" Putative_or_multifasta_raw.fa | sed 's/>//g' > oldfastaheaders
sed 's/:/ /g' oldfastaheaders | sed 's/-/ /g' | sed 's/_[0-9]\[/ /g' | sed 's/\]//g' | sed 's/_reverse/ reverse/g' > temporary
IFS=$'\n'
for line in `cat temporary`
do
echo "$line" > temp
scaffold=`cut -f1 temp`
if grep -q "reverse" temp ; then
coord_start=`awk 'BEGIN {FS=" "} {sum+=$2+$5-1} END {print sum}' temp`
coord_end=`awk 'BEGIN {FS=" "} {sum+=$2+$4-1} END {print sum}' temp`
else
coord_start=`awk 'BEGIN {FS=" "} {sum+=$2+$4-1} END {print sum}' temp`
coord_end=`awk 'BEGIN {FS=" "} {sum+=$2+$5-1} END {print sum}' temp`
fi
echo "$scaffold-$coord_start-$coord_end" >> newfastaheaders
done
paste -d " " oldfastaheaders newfastaheaders > renaming_file
perl rename_fasta.pl renaming_file Putative_or_multifasta_raw.fa > Putative_or_multifasta.fa
#Due to the -1000/+1000 extension, some orf could be found twice so we remove identical sequences
awk '/^>/ {printf("%s%s\t",(N>0?"\n":""),$0);N++;next;} {printf("%s",$0);} END {printf("\n");}' Putative_or_multifasta.fa | sort -t $'\t' -k1,1 -u | tr "\t" "\n" > Putative_or_multifasta_uniq.fa
#transform the cds fasta in protein fasta
transeq Putative_or_multifasta_uniq.fa Putative_or_multifasta.prot
#remove the _1 added by the programm transeq to protein names
sed -i 's/_1$//g' Putative_or_multifasta.prot
## two step verification : blastp followed by a maximum likelihood tree #
#perform a blastp against uniprot db
blastp -query Putative_or_multifasta.prot -db $Unitprot_database -evalue 1e-5 -outfmt "6 qseqid sseqid sscinames scomnames pident length mismatch gapopen qstart qend sstart send evalue stitle sblastnames sgi sacc" -out blastp_result -max_target_seqs 1 -num_threads 20
#extract all sequences that match the best against on OR
grep -i "olfactory\|odorant" blastp_result | cut -f1 | sort | uniq > OR_from_blast.list
#Extract corresponding protein sequences
for i in `cat OR_from_blast.list` ; do samtools faidx Putative_or_multifasta.prot $i >> Fasta_OR_from_blast.prot ; done
#Put resulting sequences, OR from zebrafish and one outgroup sequence (adra2da gene), align with mafft E-INS and make a maximum likelyhod tree
cat Fasta_OR_from_blast.prot Classification_OR_multifasta.prot Classification_outgroups.prot > Putative_or_plus_known_or_plus_outgroup.prot
mafft --maxiterate 1000 --oldgenafpair --thread -40 Putative_or_plus_known_or_plus_outgroup.prot > Putative_or_plus_known_or_plus_outgroup.prot.aln
iqtree -s Putative_or_plus_known_or_plus_outgroup.prot.aln -st AA -nt 50 -bb 1000 -m JTT+F+I+G4
echo "End of the initial search and time for manual verification of the maximum likelihood tree"