-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpmm.py
336 lines (291 loc) · 11 KB
/
pmm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
import numpy as np
from random import random, uniform, seed
from utils import logpoisson_vec_all
import matplotlib.pyplot as plt
class PoissonMixture():
def __init__(self, K, max_iter=200, eps=0.0001,
gamma_prior=True, g_a=1.1, g_b=0.1, rseed=321, verbose=1):
"""
Parameters
----------
K: int
Number of mixture components.
max_iter: int
Maximum number of EM iterations. The algorithm stops when either
the number of iteration reaches `max_iter` or the change in log likelihood
becomes less than `eps`.
eps: float
Tolerance with regards to the log likelihood of the model
to declare convergence. The algorithm stops when either
the number of iteration reaches `max_iter` or the change in log likelihood
becomes less than `eps`.
gamma_prior: bool
Add gamma prior to the model. Default is True.
g_a: float
Hyperparameter for the gamma prior. Used if `gamma_prior` is True.
g_b: float
Hyperparameter for the gamma prior. Used if `gamma_prior` is True.
rseed: int
Random seed
verbose: int
The level of verbosity. Integer value in [0, 2]
"""
self.K = K
self.max_iter = max_iter
self.eps = eps
self.gamma_prior = gamma_prior
self.g_a = g_a
self.g_b = g_b
self.rseed = rseed
self.verbose = verbose
def calculate_loglik(self, Y, lam, pi):
"""
Calculates total log likelihood of the model.
Parameters
----------
Y: np.array of size (N, D)
Number of counts per day for each student
lam: np.array of size (K, D)
Component means
pi: np.array of size (K,)
Mixture weights
* N: number of students
D: number of days in week
K: number of mixture components
Returns
-------
float
"""
N = Y.shape[0]
p_y_z = np.zeros((N, self.K))
for k in range(self.K):
p_y_z[:, k] = np.exp(logpoisson_vec_all(lam[k], Y, self.gamma_prior,
self.g_a, self.g_b))
p_y = np.dot(p_y_z, pi)
return np.sum(np.log(p_y))
def _e_step(self, Y, lam, pi):
"""
E-step of EM algorithm for Poisson mixture model.
Calculates membership weights for each student.
Parameters
----------
Y: np.array of size (N, D)
Number of counts per day for each student
lam: np.array of size (K, D)
Component means
pi: np.array of size (K,)
Mixture weights
* N: number of students
D: number of days in week
K: number of mixture components
Returns
-------
np.array of size (N, K)
calculated membership weights
"""
N = Y.shape[0]
p_y_z = np.zeros((N, self.K))
mw = np.zeros((N, self.K))
for k in range(self.K):
p_y_z[:, k] = np.exp(logpoisson_vec_all(lam[k], Y, self.gamma_prior,
self.g_a, self.g_b))
mw[:, k] = p_y_z[:, k] * pi[k]
denom = np.tile(np.sum(mw, axis=1), (self.K, 1)).T
mw /= denom
return mw
def _m_step(self, Y, mw, fix_one_lambda_flat):
"""
M-step of EM algorithm for Poisson mixture model.
Does MLE or MAP estimation (depending on whether the model has Gamma prior or not)
for parameters pi (mixture weights) and lambdas (component means).
Parameters
----------
Y: np.array of size (N, D)
Number of counts per day for each student
mw: np.array of size (N, K)
Membership weight for each student
fix_one_lambda_flat: bool
If True, one of the mixture component will be forced to have a very
flat lambda vector as its mean.
Default is set to False.
* N: number of students
D: number of days in week
K: number of mixture components
Returns
-------
tuple[np.array, np.array]
"""
N = Y.shape[0]
D = Y.shape[1]
Nk = np.sum(mw, axis=0)
pi = Nk / float(N)
if self.gamma_prior:
lam = np.dot(mw.T, (Y + self.g_a - 1)) / ((1 + self.g_b) * Nk[:, None])
else:
lam = np.dot(mw.T, Y) / (Nk[:, None] + 1e-7) # add a very small number to avoid division by 0
if fix_one_lambda_flat:
lam[0, 1:] = np.sum(lam[0, 1:]) / (D - 1)
return lam, pi
def fit(self, Y, fix_one_lambda_flat=False, return_obj=True):
"""
Fit the Poisson mixture model with data matrix `Y`.
Parameters
----------
Y: np.array of size (N, D)
Number of counts per day matrix for each student
Each row is a student's daily activity count vector.
fix_one_lambda_flat: bool
If True, one of the mixture component will be forced to have a very
flat lambda vector as its mean.
Default is set to False.
return_obj: bool
If True, returns result as an object of class PoisMixResult.
If False, returns a list of results.
* N: number of students
D: number of days in week
K: number of mixture components
Returns
-------
PoisMixResult
or tuple[np.array, np.array, np.array, float] that corresponds to [lam, mw, pi, loglik]
"""
def print_params(lam, pi, loglik):
print("lam", lam)
print("pi", pi)
print("loglik", loglik)
print("")
Y = np.array(Y)
N = Y.shape[0]
K = self.K
# mixture weights
pi = np.ones(K) / float(K)
# membership weights
mw = np.zeros((N, K))
# Initialize log likelihood
loglik = 0.0
# yi is a vector
if (len(Y.shape) == 2) and (Y.shape[1] > 1):
# Days
D = Y.shape[1]
# Initialize lambdas
lam = np.zeros((K, D))
seed(self.rseed)
for k in range(K):
lam[k] = np.ones((1, D)) * uniform(0.5, 3)
for i in range(self.max_iter):
prev_ll = loglik
loglik = self.calculate_loglik(Y, lam, pi)
if abs(prev_ll-loglik) < self.eps:
break
if self.verbose > 0:
if i % 20 == 0:
print("----- number of iterations: " + str(i))
print(" loglik: %.3f\n" % loglik)
if self.verbose > 2:
print_params(lam, pi, loglik)
# Estep
mw = self._e_step(Y, lam, pi)
# Mstep
lam, pi = self._m_step(Y, mw, fix_one_lambda_flat)
# calculate final log lik
loglik = self.calculate_loglik(Y, lam, pi)
if self.verbose > 0:
print("----- number of iterations: " + str(i))
print(" loglik: %.3f\n" % loglik)
if return_obj:
return PoisMixResult(lam, mw=mw, pi=pi, loglik=loglik,
gamma_prior=self.gamma_prior, g_a=self.g_a, g_b=self.g_b)
else:
return lam, mw, pi, loglik
else:
print("ERROR: Y should be a 2 dimensional array. Exiting..")
return
class PoisMixResult():
"""
A class that stores all the results from Poisson mixture model.
Variables
---------
assignments: np.array of size N
List of mixture group indices that each student belongs to
(when thresholded at membetship weight = 0.5)
group_counts: dict[int:int]
Dictionary that shows the number of students assigned to each group index.
Key: group/component index, Value: Number of students
* Most variables have the same format as in the class PoissonMixture. (lam, mw, pi, loglik)
"""
def __init__(self, lam, mw=None, pi=None, loglik=0.0,
gamma_prior=False, g_a=2.0, g_b=2.0):
"""
Parameters
----------
lam: np.array with size (K, D)
Mixture component means
mw: np.array with size (N, K)
membership weight of each student and each mixture component
pi: np.array with size (K, )
mixture weight for each component
loglik: float
log likelihood of the final model
gamma_prior: bool
True if the model has gamma prior
g_a: float
Hyperparameter of gamma prior a
g_b: float
Hyperparameter of gamma prior b
"""
self.K = lam.shape[0]
self.gamma_prior = gamma_prior
self.g_a = g_a
self.g_b = g_b
self.lam = lam
self.mw = mw
self.pi = pi
self.loglik = loglik
if mw is not None:
self.N = mw.shape[0]
self.assignments = np.argmax(self.mw, axis=1)
from collections import Counter
self.group_counts = dict(Counter(self.assignments))
def plot_lambdas(self, xticklabels=None, divide_by_week=False, colors=None,
fontsize=15, ticklab_size=15,
save_fig=False, file_prefix='./lambda'):
"""
Plot lambdas (component means) into pdf files.
Parameters
----------
xticklabels
divide_by_week
colors
save_fig
file_prefix
Returns
-------
"""
print('Plotting lambdas')
if colors == None:
colors = ['blue'] * self.K
maxlam = int(np.max(self.lam) + 1.0)
n_lam = self.lam.shape[1]
for k in range(self.K):
fig, ax = plt.subplots(figsize=(6, 4))
if divide_by_week:
ax.bar(range(n_lam), self.lam[k] / 5.0, align='center', color=colors[k], linewidth=0)
ax.set_ylabel('AVG NUMBER OF TASKS\nON EACH DAY OF WEEK', fontsize=fontsize)
else:
ax.bar(range(n_lam), self.lam[k], align='center', color=colors[k], linewidth=0)
ax.set_ylabel('TOTAL NUMBER OF TASKS\nON EACH DAY OF WEEK', fontsize=fontsize)
ax.grid(alpha=0.2)
ax.set_ylim(0, maxlam)
ax.set_yticks(range(maxlam + 1))
ax.set_xlim(-0.5, n_lam - 0.5)
ax.set_xticks(range(n_lam))
if xticklabels is None:
if n_lam == 6:
ax.set_xticklabels(['SS', 'M', 'T', 'W', 'T', 'F'], fontsize=ticklab_size)
elif n_lam == 7:
ax.set_xticklabels(['M', 'T', 'W', 'T', 'F', 'S', 'S'], fontsize=ticklab_size)
else:
ax.set_xticklabels(xticklabels)
if save_fig:
fname = '_'.join([file_prefix, str(self.K), str(k) + '.pdf'])
fig.savefig(fname)