# prediction_util.py
import os
import re
import numpy as np
import pandas as pd
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.models import load_model
from Bio import Seq
from feature_util import *  # expected to provide featurize_data, concatenate_feature_sets and feature_options

# load models for wild-type SpCas9 (U6 and T7 promoters), eSpCas9(1.1) and SpCas9-HF1
dir_path = os.path.dirname( os.path.realpath( __file__ ) )
wt_u6_model_file_path = os.path.join( dir_path, 'models/DeepWt_U6.hd5' )
wt_t7_model_file_path = os.path.join( dir_path, 'models/DeepWt_T7.hd5' )
esp_model_file_path = os.path.join( dir_path, 'models/esp_rnn_model.hd5' )
hf_model_file_path = os.path.join( dir_path, 'models/hf_rnn_model.hd5' )

model_wt_u6 = load_model( wt_u6_model_file_path )
model_wt_t7 = load_model( wt_t7_model_file_path )
model_hf = load_model( hf_model_file_path )
model_esp = load_model( esp_model_file_path )
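# Note (added for orientation, based on the selection logic in output_prediction below):
# model_type 'wt_u6' / 'wt_t7' -> wild-type SpCas9 (U6 / T7 promoter models),
# 'esp' -> eSpCas9(1.1), 'hf' -> SpCas9-HF1.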
# get embedding data
def make_data(X):
    vectorizer = text.Tokenizer( lower=False, split=" ", num_words=None, char_level=True )
    vectorizer.fit_on_texts( X )
    # construct a new vocabulary
    alphabet = "ATCG"
    char_dict = {}
    for i, char in enumerate(alphabet):
        char_dict[char] = i + 1
    word_index = {k: (v + 1) for k, v in char_dict.items()}
    word_index["PAD"] = 0
    word_index["START"] = 1
    vectorizer.word_index = word_index.copy()
    index_word = {v: k for k, v in word_index.items()}
    X = vectorizer.texts_to_sequences(X)
    X = [[word_index["START"]] + [w for w in x] for x in X]
    X = sequence.pad_sequences(X)
    return X
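# Illustration (a sketch, not part of the original module): for a single 21mer
# such as 'GACGATCGATCGATCGATCGA', make_data returns an integer array of shape
# (1, 22): a START token (1) followed by the 21 bases encoded as A=2, T=3,
# C=4, G=5, left-padded with 0 when input sequences differ in length.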
def my_feature(df_model, feature_options):
    feature_options['order'] = 1
    feature_sets = featurize_data( df_model, feature_options )
    inputs, dim, dimsum, feature_names = concatenate_feature_sets( feature_sets )
    return inputs, dim, dimsum, feature_names, feature_sets
def get_embedding_data(data, feature_options):
    feature_options['order'] = 1
    # generate the biological (hand-crafted) features
    r = my_feature( data, feature_options )
    lst_features = [0, 1, 2, 3, -7, -6, -5, -4, -3, -2, -1]
    feat_names = list( r[3][i] for i in lst_features )
    biofeat = r[0][:, lst_features]
    df_biofeat = pd.DataFrame( data=biofeat, columns=feat_names )
    # sequence embedding representation
    X_1 = make_data( data['21mer'] )
    # biological feature representation
    X_biofeat = np.array( df_biofeat )
    return X_1, X_biofeat
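# Shape note (an observation from the code above, not original documentation):
# X_1 has shape (N, 22) -- the START token plus 21 encoded bases -- and
# X_biofeat has shape (N, 11), the eleven columns selected by lst_features;
# the two arrays are passed together as the model inputs below.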
def output_prediction_old(inputs, df, model_type='esp'):
    # legacy version that reloads the model from disk on every call,
    # using the original path scheme
    model_file = model_type + '_rnn.hd5'
    model_file_path = os.path.join( dir_path, model_file )
    model = load_model( model_file_path )
    Efficiency = model.predict( inputs )
    df['gRNA_Seq'] = df['21mer'].apply( lambda x: x[:-1] )
    df['Efficiency'] = np.clip( Efficiency, 0, 1 )
    df = df.drop( ['21mer'], axis=1 )
    df.reset_index( inplace=True )
    return df.sort_values( by='Efficiency', ascending=False ).to_dict( orient='records' )
def output_prediction(inputs, df, model_type='esp'):
    # pick one of the models loaded at module import time
    if model_type == 'esp':
        model = model_esp
    elif model_type == 'wt_u6':
        model = model_wt_u6
    elif model_type == 'wt_t7':
        model = model_wt_t7
    elif model_type == 'hf':
        model = model_hf
    else:
        raise ValueError( 'unknown model_type: %s' % model_type )
    Efficiency = model.predict( inputs )
    df['gRNA_Seq'] = df['21mer'].apply( lambda x: x[:-1] )
    df['Efficiency'] = np.clip( Efficiency, 0, 1 )
    df = df.drop( ['21mer'], axis=1 )
    df.reset_index( inplace=True )
    return df.sort_values( by='Efficiency', ascending=False )
def effciency_predict(sequence, model_type='esp'):
    sequence = sequence.strip()
    # find every index at which 'GG' starts beyond position 20, so that a full
    # 21mer (the 20 nt spacer plus the N of the NGG PAM) lies upstream of the GG
    indexs = [m.start() for m in re.finditer( '(?=GG)', sequence ) if m.start() > 20]
    gRNA = []
    Cut_Pos = []
    Strand = []
    PAM = []
    for i in indexs:
        Strand.append( '+' )
        gRNA.append( sequence[i - 21:i] )
        Cut_Pos.append( i - 4 )
        PAM.append( sequence[i - 1:i + 2] )
    # repeat the search on the reverse-complement strand
    sequence_complement = str( Seq.Seq( sequence ).reverse_complement() )
    index_reverse = [m.start() for m in re.finditer( '(?=GG)', sequence_complement ) if m.start() > 20]
    for i in index_reverse:
        Strand.append( '-' )
        gRNA.append( sequence_complement[i - 21:i] )
        Cut_Pos.append( i - 4 )
        PAM.append( sequence_complement[i - 1:i + 2] )
    pd.set_option( 'display.precision', 5 )
    df = pd.DataFrame( {'Cut_Pos': Cut_Pos,
                        'Strand': Strand,
                        '21mer': gRNA,
                        'PAM': PAM}, columns=['Strand', 'Cut_Pos', '21mer', 'PAM'] )
    # feature_options is expected to come from the feature_util wildcard import
    X, X_biofeat = get_embedding_data( df, feature_options )
    return output_prediction( [X, X_biofeat], df, model_type )
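# Minimal usage sketch (not part of the original module). It assumes the
# trained models under models/ and the feature_util module (which is expected
# to supply the feature_options dict used above) are available, as in the
# DeepHF repository; the input sequence below is a made-up example.
if __name__ == '__main__':
    example_seq = 'GCTGCAGATGGGACATTTCCAGTCAGGAATTCGATCCTGG'  # hypothetical 40 nt target
    predictions = effciency_predict( example_seq, model_type='esp' )
    # predictions is a DataFrame of candidate gRNAs sorted by predicted efficiency
    print( predictions[['gRNA_Seq', 'PAM', 'Strand', 'Cut_Pos', 'Efficiency']] )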