-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlike_bsr.py
150 lines (105 loc) · 5.05 KB
/
like_bsr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#usr/bin/python3
# Author B10Comp - Orson Mestanza
# These script find genes in draft assembly genomes and return a Matrix with Blast score ratio for each gene in your multifasta
# You need Python3
# You need BLAST install in your environment
# You need Prodigal install in your environment
# You need Pandas,seaborn, numpy install in python3
import sys
import os
import operator
import numpy as np
import seaborn as sns
import pandas as pd
import shlex, subprocess
import matplotlib.pyplot as plt
def funcion_makeblast(multifasta):
comand_makeblast = "makeblastdb -in %s -dbtype nucl"%multifasta
print("########## Step1: Make a index of data_base ##############")
print(comand_makeblast)
subprocess.call(comand_makeblast, shell=True)
print("\n########## Done data_base ##############")
def do_prodigal(genomes):
subprocess.call("mkdir prodigal_fasta", shell=True)
print("########## Step2: Prediction of CDS using prodigal ##############")
count = 0
for i in os.listdir(genomes):
if i.endswith('.fasta') or i.endswith('.fa') or i.endswith('.fna'):
print(i)
subprocess.check_output("prodigal -c -d prodigal_fasta/pro_%s -i %s%s -m -q"%(i,genomes,i), shell=True)
count += 1
print("\n########## Done %d Predictions ##############\n"%count)
return "prodigal_fasta"
def do_blastn(Predictions,index_blast,cores):
print("########## Step3: Do blastn ##############")
count_blast = 0
for i in os.listdir(Predictions):
if i.endswith('.fasta'):
new_name = i.split(".fasta")[0]
print("blastn -db %s -query %s/%s -outfmt 6 -num_alignments 5 -perc_identity 60 -qcov_hsp_perc 75 -num_threads %d -out %s/%s.txt"%(index_blast,Predictions,i,int(cores),Predictions,new_name))
subprocess.call("blastn -db %s -query %s/%s -outfmt 6 -num_alignments 5 -perc_identity 60 -qcov_hsp_perc 75 -num_threads %d -out %s/%s.txt"%(index_blast,Predictions,i,int(cores),Predictions,new_name), shell=True)
count_blast += 1
print("\n########## Done %d blastn ##############\n"%count_blast)
def extract_bsr_reference(data_base,cores):
order_ref = [] # Order of genes in reference
bsr_ref = [] # blast score ratio of reference
length_ref = []
file = subprocess.check_output("blastn -db %s -query %s -outfmt 6 -num_alignments 5 -perc_identity 100 -qcov_hsp_perc 100 -num_threads %d"%(data_base,data_base,int(cores)),shell=True).decode("utf-8") #, encoding="utf-8"
bsr = file.split("\n")
for i in range(len(bsr)):
new_list = bsr[i].split("\t")
if len(new_list) > 2:
if new_list[0] == new_list[1] and int(new_list[4]) == 0:
order_ref.append(new_list[0])
bsr_ref.append(int(new_list[11]))
length_ref.append(int(new_list[3]))
return order_ref, bsr_ref, length_ref
def do_matrix_bsr(blastn_files,order_gene,bsr_gene,len_gene,output):
#df1 = pd.DataFrame(columns=order_gene)
frames = []
print("########## Step3: Select best hit and do matrix_bsr ##############")
for i in os.listdir(blastn_files):
if i.endswith('.txt'):
index = i.split(".txt")[0]
nombres = index
index = {}
with open("%s/%s"%(blastn_files,i)) as f:
for line in f:
line = line.split("\t")
if not line[1] in index:
if float(line[3])/len_gene[order_gene.index(line[1])]*100 > 90: # Si el alineamiento entre la query y el subjet es mas del 60%
new = line[1]
index[new] = float(line[11])/float(bsr_gene[order_gene.index(line[1])])
else:
if float(line[3])/len_gene[order_gene.index(line[1])]*100 > 90:
if index[line[1]] < float(line[11])/float(bsr_gene[order_gene.index(line[1])]):
index[line[1]] = float(line[11])/float(bsr_gene[order_gene.index(line[1])])
df = pd.DataFrame(index, index=[nombres], columns=order_gene)
df = df.dropna(axis=1)
print(df)
frames.append(df)
index = {}
results = pd.concat(frames)
results = results.replace(np.nan, 0)
results.to_csv("%s.tab"%output, sep='\t', encoding='utf-8')
#print(results)
#sns.heatmap(results, annot=False, xticklabels=1, linewidths=1, linecolor='white')
#plt.show()
def main():
print("Usage:")
print(" python like_bsr.py dir_DraftGenomes Multifasta_for_DB\n\n")
file0 = sys.argv[0]
genomes = sys.argv[1]
data_base = sys.argv[2]
cores = sys.argv[3]
output = sys.argv[4]
funcion_makeblast(data_base) # These function do makeblastdb with multifasta input
nucle_files = do_prodigal(genomes) # These function predicted CDS using prodigal and return the new dir with cds.
do_blastn(nucle_files,data_base,cores) # These functoin make a blastn cds predicted with prodigal vs your genes in multifasta.
order_gen, bsr_gen, len_gen = extract_bsr_reference(data_base,cores) # These function return two arrays. 1. Is array with order of genes of reference. 2 is array with blast score ratio for each gene 3. Is array with the length of each gene.
do_matrix_bsr(nucle_files,order_gen,bsr_gen,len_gen,output)
if __name__ == "__main__":
main()
#Some problems that you can solve in the future
# 1. For example the genes is not in the same position to reference.
# 2. Is necessary plot a heatmap with these data in near future.