'''naive_bayes.py
Naive Bayes classifier with Multinomial likelihood for discrete features
'''
import numpy as np


class NaiveBayes:
    '''Naive Bayes classifier using Multinomial likelihoods'''

    def __init__(self, num_classes):
        '''Naive Bayes constructor

        Parameters:
        -----------
        num_classes: int. Number of classes the classifier distinguishes between.
        '''
        self.num_classes = num_classes
        # Learned during train(): shape=(num_classes,) and shape=(num_classes, num_features)
        self.class_priors = None
        self.class_likelihoods = None

    def get_priors(self):
        '''Returns the class priors'''
        return self.class_priors

    def get_likelihoods(self):
        '''Returns the class likelihoods'''
        return self.class_likelihoods
    def train(self, data, y):
        '''Train the Naive Bayes classifier so that it records the "statistics" of the training set

        Parameters:
        -----------
        data: ndarray. shape=(num_samps, num_features). Count data (e.g. word counts per sample).
        y: ndarray. shape=(num_samps,). Int class labels in the range [0, num_classes).
        '''
        num_samps, num_features = data.shape
        self.class_priors = np.zeros(self.num_classes)
        self.class_likelihoods = np.zeros((self.num_classes, num_features))

        for c in range(self.num_classes):
            in_class = y == c
            # Prior: proportion of training samples that belong to class c
            self.class_priors[c] = np.sum(in_class) / num_samps

            # Multinomial likelihood with Laplace (add-one) smoothing:
            # P(feature j | class c) = (count of feature j in class c + 1) / (total count in class c + num_features)
            class_counts = np.sum(data[in_class], axis=0)
            total_count = np.sum(class_counts)
            self.class_likelihoods[c] = (class_counts + 1) / (total_count + num_features)
    def predict(self, data):
        '''Combine the class likelihoods and priors to compute the posterior distribution.

        Parameters:
        -----------
        data: ndarray. shape=(num_test_samps, num_features).

        Returns:
        -----------
        ndarray of nonnegative ints. shape=(num_test_samps,). Predicted class of each test data sample.
        '''
        # Work in log space to avoid numerical underflow:
        # log P(class | sample) is proportional to log P(class) + sum_j count_j * log P(feature j | class)
        log_prior = np.log(self.class_priors)
        log_posterior = log_prior + data @ np.log(self.class_likelihoods).T
        return np.argmax(log_posterior, axis=1)
    def accuracy(self, y, y_pred):
        '''Computes accuracy based on percent correct: Proportion of predicted class labels `y_pred`
        that match the true values `y`.

        Parameters:
        -----------
        y: ndarray. shape=(num_data_samps,)
            Ground-truth, known class labels for each data sample
        y_pred: ndarray. shape=(num_data_samps,)
            Predicted class labels by the model for each data sample

        Returns:
        -----------
        float. Between 0 and 1. Proportion correct classification.
        '''
        # Proportion of samples whose predicted label matches the ground truth
        return np.mean(y == y_pred)
    def confusion_matrix(self, y, y_pred):
        '''Create a confusion matrix based on the ground truth class labels (`y`) and those predicted
        by the classifier (`y_pred`).

        Parameters:
        -----------
        y: ndarray. shape=(num_data_samps,)
            Ground-truth, known class labels for each data sample
        y_pred: ndarray. shape=(num_data_samps,)
            Predicted class labels by the model for each data sample

        Returns:
        -----------
        ndarray. shape=(num_classes, num_classes).
            Confusion matrix
        '''
        matrix = np.zeros((self.num_classes, self.num_classes))
        for i in range(self.num_classes):
            for j in range(self.num_classes):
                # Count samples whose true class is i and predicted class is j
                matrix[i, j] = np.sum((y == i) & (y_pred == j))
        return matrix
    def kfold(self, data, labels, k):
        '''Perform k-fold cross validation on the data and labels. Returns an array of accuracies.

        Parameters:
        -----------
        data: ndarray. shape=(num_data_samps, num_features).
        labels: ndarray. shape=(num_data_samps,).
        k: int. Number of folds to use in cross validation

        Returns:
        -----------
        accuracies: ndarray. shape=(k,). Array of accuracies for each fold
        '''
        # Shuffle the data and labels together so each fold is a random sample
        inds = np.arange(labels.size)
        np.random.shuffle(inds)
        features = data[inds].copy()
        y = labels[inds].copy()

        accuracies = np.zeros(k)
        start = 0
        fold = y.size // k
        for i in range(k):
            end = start + fold
            # Test fold: the current contiguous block of shuffled samples
            x_test = features[start:end, :]
            y_test = y[start:end]
            # Training set: everything before and after the test fold
            x_train = np.vstack((features[:start, :], features[end:, :]))
            y_train = np.hstack((y[:start], y[end:]))
            # Train on the remaining folds and evaluate on the held-out fold
            self.train(x_train, y_train)
            y_pred = self.predict(x_test)
            accuracies[i] = self.accuracy(y_test, y_pred)
            start += fold
        return accuracies
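

# --- Minimal usage sketch (illustrative, not part of the original class) ---
# The synthetic data below is an assumption: a small Poisson-count matrix standing in
# for bag-of-words counts, used only to exercise train/predict/accuracy/confusion_matrix/kfold.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    num_samps, num_features, num_classes = 200, 10, 2

    # Class 1 samples tend to have higher counts in the later features, class 0 in the earlier ones
    labels = rng.integers(0, num_classes, size=num_samps)
    rates = np.where(labels[:, None] == 1,
                     np.linspace(1.0, 5.0, num_features),
                     np.linspace(5.0, 1.0, num_features))
    counts = rng.poisson(rates)

    nb = NaiveBayes(num_classes=num_classes)
    nb.train(counts, labels)
    preds = nb.predict(counts)
    print('training accuracy:', nb.accuracy(labels, preds))
    print('confusion matrix:\n', nb.confusion_matrix(labels, preds))
    print('5-fold accuracies:', nb.kfold(counts, labels, k=5))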