# In[1]:
#import numpy as np
#from numpy.random import rand
#import csv
#import h5py
#import signal
#import fast5
#import fastq
#import re
#import matplotlib.pyplot as plt
#import tensorflow as tf
#from tensorflow import keras
# In[12]:
import re

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
# In[13]:
#import tensorflow as tf
#with tf.device('/gpu:0'):
# a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
# b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
# c = tf.matmul(a, b)
#with tf.Session() as sess:
# print (sess.run(c))
# In[14]:
def BaseFunction(x, y):
    """
    Return the 2D shape (len(x) // y, y): the number of y-mers
    that fit in the list, by the k-mer size itself (y=4 for 4-mers).
    """
    return (len(x) // y, y)
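# Quick illustrative check (not in the original notebook): a 12-base
# read reshapes to three 4-mers.
assert BaseFunction(list("ATCGATCGATCG"), 4) == (3, 4)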
# In[15]:
def SigFunction(x, y):
    """
    Return the 2D shape (y, x // y): the number of k-mers in the
    read, by the average number of signal samples per k-mer.
    """
    return (y, x // y)
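# Illustrative check (not in the original notebook): 400 signal samples
# spread over 10 k-mers gives a (10, 40) shape.
assert SigFunction(400, 10) == (10, 40)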
# In[16]:
def array_inspect(x):
    """Print basic information about a NumPy array."""
    print("Shape is", x.shape)
    print("Length is", len(x))
    print("Dimension is", x.ndim)
    print("Total size is", x.size)
    print("Type is", x.dtype)
    print("Type name is", x.dtype.name)
# In[17]:
def normalize(x, y):
    """
    Normalize the raw signal resistances from a read by dividing
    each sample in x by the read mean y.
    """
    z = np.divide(x, y)
    # np.savetxt("NormalizedSigArray_{}.csv".format(), z, delimiter=",")
    return z
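# Illustrative check (not in the original notebook): dividing a small
# signal array by its mean centres the values around 1.0.
_demo = np.array([80.0, 100.0, 120.0])
print(normalize(_demo, _demo.mean()))  # -> [0.8 1.  1.2]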
# In[18]:
"""
This cell does the following:
1) imports the bases from the FASTA file
2) trims off the headers, newlines, and digits
3) splits them into one string per read
4) recodes the bases as 0123 instead of ATCG (A=0, T=1, C=2, G=3)
"""
with open("fasta/sampled_read.fasta", "r") as f:  # the file with the reads
    a = f.read()
b = a.split(">", 12)
base = [re.sub(r">|\n|\d", "", s) for s in b]
baseA0 = [re.sub("A", "0", s) for s in base]
baseT1 = [re.sub("T", "1", s) for s in baseA0]
baseC2 = [re.sub("C", "2", s) for s in baseT1]
base_coded = [re.sub("G", "3", s) for s in baseC2]
A0 = a.count("A")
T1 = a.count("T")
G2 = a.count("G")
C3 = a.count("C")
names = ['A', 'T', 'G', 'C']
values = [A0, T1, G2, C3]
plt.subplot()
plt.bar(names, values)
plt.suptitle('Nucleotides in all reads')
plt.show()
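# An equivalent, more compact recoding using str.translate (a sketch;
# it should produce the same 0123 strings as the chained re.sub calls):
_table = str.maketrans("ATCG", "0123")
base_coded_alt = [s.translate(_table) for s in base]
print(base_coded_alt == base_coded)  # expected: True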
# In[19]:
"""
The purpose of this code is the following:
1) convert the coded reads to integer form
2) create an array per read, with the read's 4-mers as rows
3) create a 2D matrix with the base counts (A,T,C,G) as rows
   and the reads as columns
4) create a 1D array with the read lengths
"""
kmer = []
kmercount = []
reads = []
readslength = []
res = len(base_coded)
for x in range(res):
    l = [int(i) for i in base_coded[x]]
    counts = (l.count(0), l.count(1), l.count(2), l.count(3))  # per-read counts of A, T, C, G
    kmercount.append(counts)
    n = np.transpose(kmercount)  # 2D matrix with the base counts as rows (A,T,C,G) and the reads as columns
    kmer = n.view()  # a view of the array with the same data
    d = np.asarray(l)
    d.resize(BaseFunction(d, 4))  # pass the list and the desired k-mer size; reshapes to (#4-mers, 4)
    reads.append(d)  # store each read's 4-mer array in reads
    c = len(d)
    readslength.append(c)  # store the number of 4-mers per read in readslength
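# The per-read base counts above can also be computed in one call with
# np.bincount (an illustrative sketch on the last read):
_coded = np.array([int(c) for c in base_coded[-1]], dtype=np.int64)
print(np.bincount(_coded, minlength=4))  # counts of 0,1,2,3 (A,T,C,G)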
# In[20]:
"""
The purpose of this code is the following:
1) visualize the read lengths
"""
plt.ioff()
plt.title("Readlengths")
plt.ylabel('number of bases')
plt.xlabel('read id')
plt.plot(kmer.sum(axis=0))  # column sums of the base-count matrix give the bases per read
plt.xticks(np.arange(0, 12))
plt.show()
# In[21]:
"""
This cell does the following:
1) imports the signal-level data
2) normalizes it to the mean of the signal
3) appends the signal data from each read, reshaped per 4-mer, to the sig list
"""
sig = []
for x in range(12):
    a = np.loadtxt("signal/signal_{}.txt".format(x))  # load the signal
    m = a.mean()  # obtain the average signal for the read
    b = normalize(a, m)  # divide each resistance by the average to normalize
    c = b.size  # number of resistance signals in the read
    d = readslength[x]  # number of 4-mers in the read
    e = SigFunction(c, d)  # (#4-mers, average resistances per 4-mer)
    b.resize(e)  # reshape so each row holds the signal samples for one 4-mer
    sig.append(b)
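# Nanopore signal is often normalized by z-scoring rather than dividing
# by the mean; a hedged alternative sketch (not used downstream):
def zscore(x):
    return (x - x.mean()) / x.std()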
# In[22]:
"""
The goal of this cell is to convert the 4-mers of the training read
to strings, index the unique 4-mers, and create a new list of the
indexed labels (at most 256 classes) for the model.
"""
arr2 = []  # each 4-mer of the read as a concatenated string
converted = []
converted2 = []
for i, v in enumerate(reads[11]):  # read 11 is the training data
    temp = ''
    for w in v:
        temp = temp + str(w)
    arr2.append(temp)
a = np.unique(arr2)  # every unique 4-mer in the read, for downstream indexing
for x in arr2:
    idx = [i for i, j in enumerate(a) if j == x]  # position of x in the unique array
    converted.append(idx)
for i, v in enumerate(converted):
    temp = ''
    for w in v:
        temp = temp + str(w)
    converted2.append(temp)
y_train = np.int16(converted2)
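# The quadratic indexing loop above can be replaced by a single
# np.unique call (a sketch; should reproduce the same labels):
_uniq, y_train_alt = np.unique(arr2, return_inverse=True)
print(np.array_equal(y_train_alt, y_train))  # expected: True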
# In[23]:
"""
The same conversion as above, applied to the testing read (read 10).
"""
arr2 = []  # each 4-mer of the read as a concatenated string
converted = []
converted2 = []
for i, v in enumerate(reads[10]):  # read 10 is the testing data
    temp = ''
    for w in v:
        temp = temp + str(w)
    arr2.append(temp)
a = np.unique(arr2)  # every unique 4-mer in the read, for downstream indexing
for x in arr2:
    idx = [i for i, j in enumerate(a) if j == x]
    converted.append(idx)
for i, v in enumerate(converted):
    temp = ''
    for w in v:
        temp = temp + str(w)
    converted2.append(temp)
n = np.unique(converted2)
print(n.size)  # number of distinct 4-mer classes in the test read
y_test = np.int16(converted2)
# In[24]:
x_train = sig[11]
x_test = sig[10]
array_inspect(x_train)
(i, j) = x_train.shape  # i = number of 4-mers, j = signal samples per 4-mer
# In[48]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(256, activation=tf.nn.relu, input_shape=(j,)))  # hidden layer 1: 256 feed-forward ReLU units
model.add(tf.keras.layers.Dense(256, activation=tf.nn.relu))  # hidden layer 2, same as layer 1
model.add(tf.keras.layers.Dense(256, activation=tf.nn.relu))  # hidden layer 3, same as layer 1
model.add(tf.keras.layers.Dense(256, activation=tf.nn.softmax))  # output layer: one unit per 4-mer class (4**4 = 256), softmax for a probability distribution
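# With the input shape specified above, the model is already built, so
# its layer-by-layer parameter counts can be inspected (optional):
model.summary()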
# In[49]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# In[51]:
# model.fit(x_train, y_train, epochs=30)
history = model.fit(x_train, y_train,
                    epochs=10, batch_size=i, verbose=1)  # batch_size=i trains on the whole read as one batch
# In[28]:
history2 = model.fit(x_train, y_train,
                     epochs=10, batch_size=512, verbose=1)
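# In[ ]:
# A held-out fraction can also be monitored during training (a hedged
# sketch, commented out so it does not alter the runs above):
# history3 = model.fit(x_train, y_train, epochs=10, batch_size=512,
#                      validation_split=0.1, verbose=1)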
# In[47]:
val_loss, val_acc = model.evaluate(x_test, y_test)
print(val_loss)
print(val_acc)
# In[ ]:
# In[38]:
predictions = model.predict(x_test)
# In[31]:
history_dict = history.history
print(history_dict.keys())
acc = history_dict.get('accuracy', history_dict.get('acc'))  # the key is 'acc' in older Keras, 'accuracy' in tf.keras 2.x
loss = history_dict['loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
# In[32]:
plt.plot(epochs, acc, 'ro', label='Accuracy')
plt.title('Training accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# In[35]:
print(np.argmax(predictions[0]))  # argmax gives the predicted class index for the first test 4-mer
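# To map a predicted class index back to a 4-mer string, index into the
# unique-4-mer array built for the test read (a sketch; assumes the
# index falls within the 4-mers actually seen in read 10):
pred_idx = np.argmax(predictions[0])
if pred_idx < a.size:
    print(a[pred_idx])  # the 4-mer in the 0123 coding (A=0, T=1, C=2, G=3)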
# In[ ]:
array_inspect(x_test)
array_inspect(y_test)
# In[ ]: