# breast-cancer.py
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from log_regression2 import log_regression
from NaiveBayes3 import NaiveBayes
from cross_validation2 import cross_validation
import separate
# Dictionary of categories: contains one dictionary per feature column,
# mapping each raw category (string) to an integer code
categories = {}

# Makes all the inputs of the dataset numerical
def transform(data):
    for j in range(len(data[0])):
        category_number = 0  # keeps track of the next category number in the feature
        for i in range(len(data)):
            # Skip features that are already numerical
            if isinstance(data[i][j], (int, float)):
                break
            # Create a dictionary for a feature if it does not exist
            if j not in categories:
                categories[j] = {}
            # Assign a numerical value to each category not seen before
            if data[i][j] not in categories[j]:
                categories[j][data[i][j]] = category_number
                category_number += 1
            data[i][j] = categories[j][data[i][j]]
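# Illustrative sketch of what transform() builds (the category names here are
# assumed for illustration; the actual strings come from the CSV):
#   categories == {0: {'no-recurrence-events': 0, 'recurrence-events': 1},
#                  1: {'30-39': 0, '40-49': 1, ...},
#                  ...}
# i.e. one dict per column, and the data cells are replaced in place by these codes.
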
# Makes data one-hot encoded.
# Note on indexing: oneHot is called after the target (column 0) has been deleted,
# so column j of `data` corresponds to key j+1 in `categories`.
def oneHot(data):
    one_hot = []
    for i in range(len(data)):
        one_hot.append(list())
        for j in range(len(data[0])):
            # Just copy non-categorical features
            if (j+1) not in categories:
                one_hot[i].append(data[i][j])
            # Make categorical features one-hot encoded
            else:
                # Array of 0s, one per category in that feature
                temp = [0]*len(categories[j+1])
                # Put a 1 at the correct category
                temp[data[i][j]] = 1
                one_hot[i].extend(temp)
    one_hot = np.array(one_hot)
    return one_hot
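# Minimal worked example with made-up values: if categories == {1: {'a': 0, 'b': 1, 'c': 2}}
# and data == [[1, 5]] (column 0 here is original column 1, column 1 is original
# column 2), then oneHot(data) returns array([[0, 1, 0, 5]]): the categorical
# column expands into three indicator columns and the other column is copied as-is.
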
data = pd.read_csv("breast-cancer.csv", header=None)
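# The file is presumably the UCI Breast Cancer (Ljubljana) dataset: class label
# in column 0, categorical features, and "?" marking missing values; the
# cleaning loop below relies on that "?" convention.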
# Convert the DataFrame into a numpy array
data = data.values
data_temp = np.zeros(data.shape, dtype='O')
row = 0
# Detect oddities
for i in range(len(data)):
    # Test for missing data (row length)
    if len(data[0]) != len(data[i]):
        print("values on row", i, "are missing")
    # Test for malformed features
    include = True  # only keep rows without missing features ("?")
    for j in range(len(data[0])):
        # Remove leading/trailing spaces
        if isinstance(data[i][j], str):
            data[i][j] = data[i][j].strip()
        # Flag instances with missing data
        if data[i][j] == "?":
            include = False
            break
    # Only copy rows that don't have missing features ("?")
    if include:
        data_temp[row, :] = data[i, :]
        row += 1
data = data_temp[0:row]
data_raw = data.copy()
# Transform data into numerical values
transform(data)
res = np.zeros(len(data))
# Copy the targets into a binary representation (the class label is in column 0)
for i in range(len(data)):
    res[i] = data[i][0]
# Delete the target from data (first column)
data = np.delete(data, 0, 1)
# Make the input categories one-hot encoded
one_hot = oneHot(data)
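# After encoding, each categorical column is replaced by len(categories[col])
# indicator columns, so one_hot generally has many more columns than data.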

# Histogram of the targets
plt.figure(1)
plt.hist([res[np.argwhere(res == 0)], res[np.argwhere(res == 1)]], label=['neg', 'pos'])
plt.legend(loc='upper right')
plt.title("Distribution of the positive vs negative classes")
plt.show()

# Note: there are no numerical features in this dataset
# Distributions of some categorical features (feature columns 0, 1, 2, 3 were considered)
f = (0, 1, 2, 3)
pos = np.argwhere(res == 1)
neg = np.argwhere(res == 0)
# Matrices of shape (feature, data point) - separation between positive and negative examples
pos_features = np.zeros((4, len(pos)))
neg_features = np.zeros((4, len(neg)))
for i in range(4):
    neg_features[i, :] = np.squeeze(data[neg, f[i]])
    pos_features[i, :] = np.squeeze(data[pos, f[i]])
plt.figure(2)
for i in range(4):
    plt.subplot(2, 2, i+1)
    # Set bin boundaries by the minimum and maximum values of the features
    bins = np.linspace(min(min(neg_features[i, :]), min(pos_features[i, :])),
                       max(max(neg_features[i, :]), max(pos_features[i, :])), 30)
    # Plot the histograms of the positive and negative examples
    plt.hist([neg_features[i, :], pos_features[i, :]], bins, label=['neg', 'pos'])
    plt.legend(loc='upper right')
    plt.title("Distribution of feature #" + str(f[i]))
plt.show()

# This dataset does not have numerical features, so correlation between features is not
# helpful here; the block below is kept for datasets with numerical features
# (e.g. feature columns 2, 3, 4, 5).
#plt.figure(3)
#for i in range(4):
#    plt.subplot(2, 2, i+1)
#    # Correlation coefficients
#    r_neg = np.corrcoef(neg_features[i, :], neg_features[(i+1) % 4, :])
#    r_pos = np.corrcoef(pos_features[i, :], pos_features[(i+1) % 4, :])
#    # Labels for the legend
#    lbl_neg = "r_neg = " + str(round(r_neg[0, 1], 4))
#    lbl_pos = "r_pos = " + str(round(r_pos[0, 1], 4))
#    plt.scatter(neg_features[i, :], neg_features[(i+1) % 4, :], label=lbl_neg)
#    plt.scatter(pos_features[i, :], pos_features[(i+1) % 4, :], label=lbl_pos)
#    plt.legend(loc='upper right')
#    plt.title("Correlation between feature #" + str(f[i]) + " and #" + str(f[(i+1) % 4]))
#plt.show()

# Final input variables X and target variables Y
X = np.array(one_hot)
Y = np.array(res)
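# At this point X has shape (n_samples, n_encoded_features) and Y has shape
# (n_samples,), ready for the classifiers below.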

## Compare accuracy of Naive Bayes and logistic regression
# All datasets use the same logistic regression settings: learning rate = 0.01, 500 iterations
rate = 0.01
iterations = 500
log_model = log_regression(rate, iterations)
X = log_model.bias(X)  # add a bias column
# Separate training and testing sets
X_train, Y_train, X_test, Y_test = separate.separate(X, Y)
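# (separate.separate presumably shuffles the data and returns disjoint
# train/test partitions; the default split ratio is whatever the module defines.)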

## Logistic regression
# Train the model
fit_log = log_model.fit(X_train, Y_train)
# Cross validation
validation = cross_validation(rate, max_iterations=500)
score = validation.evaluate_log(X_train, Y_train)
print("Averaged training accuracy for Logistic Regression:", score)
# Test data
pre = log_model.predict(X_test, fit_log)
acc = log_model.evaluate_acc(pre, Y_test)
print("Accuracy on testing data for Logistic Regression:", acc)

## Naive Bayes
# Train the model
bayes_model = NaiveBayes()
fit_bayes = bayes_model.fit(X_train, Y_train)
# Cross validation
score = bayes_model.cross_validation(X_train, Y_train)
print("Averaged training accuracy for Naive Bayes:", score)
# Test data
pre = bayes_model.predict(X_test)
acc = bayes_model.evaluate_acc(pre, Y_test)
print("Accuracy on testing data for Naive Bayes:", acc)
print()
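# The sweep below covers learning rates from 1e-15 up to 1e4 in factors of 10
# (20 values). cross_validation with threshold=True presumably stops gradient
# descent once the loss change falls below the threshold (or at the maximum
# iteration count) and reports the number of iterations actually used.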

## Test different learning rates for gradient descent
# Loss function threshold = 5e-5; maximum number of iterations = 1000
acc = []
iters = []
rate = 10**-15
for i in range(20):
    # Cross validation
    validation = cross_validation(rate, threshold=True)
    score, iterations = validation.evaluate_log(X_train, Y_train)
    print("rate =", rate, "; iterations =", iterations, "; accuracy =", score)
    acc.append(score)
    iters.append(iterations)
    rate *= 10
rate = 1  # reset the learning rate for the training-set-size experiments below
plt.scatter(iters, acc)
plt.xlabel("iterations")
plt.ylabel("accuracy")
plt.title("Training accuracy as a function of gradient descent iterations")
plt.show()

# Training accuracy as a function of training set size (logistic regression)
acc = []
size = []
split_size = 0.1
for i in range(9):
    X_train, Y_train, X_test, Y_test = separate.separate(X, Y, split=split_size)
    # Cross validation
    validation = cross_validation(rate, threshold=True)
    score, iterations = validation.evaluate_log(X_train, Y_train)
    print("size of X =", X_train.shape[0], "; iterations =", iterations, "; accuracy =", score)
    size.append(X_train.shape[0])
    acc.append(score)
    split_size += 0.1
plt.scatter(size, acc)
plt.xlabel("size of X_train")
plt.ylabel("accuracy")
plt.title("Training accuracy as a function of training set size (logistic regression)")
plt.show()

# Training accuracy as a function of training set size (Naive Bayes)
acc = []
size = []
split_size = 0.1
for i in range(9):
    X_train, Y_train, X_test, Y_test = separate.separate(X, Y, split=split_size)
    # Cross validation
    score = bayes_model.cross_validation(X_train, Y_train)
    print("size of X =", X_train.shape[0], "; accuracy =", score)
    size.append(X_train.shape[0])
    acc.append(score)
    split_size += 0.1
plt.scatter(size, acc)
plt.xlabel("size of X_train")
plt.ylabel("accuracy")
plt.title("Training accuracy as a function of training set size (Naive Bayes)")
plt.show()