em_gmm_130916.py
# http://statweb.stanford.edu/~tibs/stat315a/LECTURES/em.pdf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import pi, log
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture  # sklearn.mixture.GMM was renamed and removed in 0.20
def preprocess_data(X, scaler=None):
    if scaler is None:
        # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler
def get_univ_normal_loglik(data, mean, var):
    '''
    Log-likelihood of univariate data under a single Gaussian.

    Parameters:
    -------------------------
    data - scalar or 1-D array of observations
    mean - mean of the Gaussian
    var  - variance of the Gaussian
    '''
    data = np.atleast_1d(np.asarray(data, dtype=float))
    if data.ndim > 1:  # input data is multidimensional
        raise ValueError("This function is valid for univariate data only.")
    n = data.size
    # log N(x | mean, var) summed over the n observations;
    # note the 2*pi inside the log and the factor n on the constant terms
    loglik = -0.5 * n * log(2 * pi) - 0.5 * n * log(var) \
             - ((data - mean) ** 2).sum() / (2 * var)
    return loglik
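# Quick sanity check, a minimal sketch assuming scipy is available: the
# log-likelihood above should equal the summed log-pdf from scipy.stats.norm
# (scale is the standard deviation, i.e. sqrt(var)).
from scipy.stats import norm
_x = np.array([0.5, 1.5, 2.0])
assert np.isclose(get_univ_normal_loglik(_x, 1.0, 0.25),
                  norm.logpdf(_x, loc=1.0, scale=0.5).sum())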
def gmm_expectation(data, means, vars, priors):
    '''
    E-step: compute the weights (responsibilities) for each observation.

    Parameters:
    -------------------------
    data   - 1-D array/Series of observations
    means  - vector of means for the k gaussians
    vars   - vector of variances for the k gaussians
    priors - vector of priors for the k gaussians

    Returns the weights (responsibilities) for each observation.
    '''
    k = len(priors)  # how many gaussians
    N = len(data)    # how many observations
    loglik_obs_i_for_mixt = np.zeros((N, k))
    # loop through gaussians to find the log-likelihood of each observation
    # under each component, then normalise to get weights / responsibilities
    for i in range(N):
        for j in range(k):
            loglik_obs_i_for_mixt[i, j] = get_univ_normal_loglik(data.iloc[i], means[j], vars[j])
    liks = np.exp(loglik_obs_i_for_mixt)  # back to likelihoods
    weights = (liks * priors) / (liks * priors).sum(axis=1, keepdims=True)
    return weights  # a.k.a. responsibilities
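# Worked sketch of the normalisation used above (numbers are illustrative):
# with priors = [0.4, 0.6] and component likelihoods [0.1, 0.3] for one
# observation, its responsibilities are
#   (0.4*0.1, 0.6*0.3) / (0.4*0.1 + 0.6*0.3) = (0.04, 0.18) / 0.22 ≈ (0.18, 0.82)
# so each row of the returned weights sums to 1.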
def gmm_loglik(data, means, vars, priors):
    '''
    Log-likelihood of the data under the mixture:
    sum_i log( sum_j priors[j] * N(data[i] | means[j], vars[j]) ).

    Parameters:
    -------------------------
    data   - 1-D array/Series of observations
    means  - vector of means for the k gaussians
    vars   - vector of variances for the k gaussians
    priors - vector of priors for the k gaussians
    '''
    k = len(priors)  # how many gaussians
    N = len(data)    # how many observations
    loglik_obs_i_for_mixt = np.zeros((N, k))
    for i in range(N):
        for j in range(k):
            loglik_obs_i_for_mixt[i, j] = get_univ_normal_loglik(data.iloc[i], means[j], vars[j])
    # the per-observation mixture density is a prior-weighted sum of component
    # densities (not of log-densities); take logs and sum over observations
    log_lik = np.log((np.exp(loglik_obs_i_for_mixt) * priors).sum(axis=1)).sum()
    return log_lik
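# Cross-check sketch for gmm_loglik, using hypothetical toy parameters: the
# mixture log-likelihood should match a direct scipy computation.
_toy = pd.Series([1.0, 2.0, 4.5])
_direct = np.log(0.4 * norm.pdf(_toy, 1.0, np.sqrt(0.2)) +
                 0.6 * norm.pdf(_toy, 4.0, np.sqrt(0.2))).sum()
assert np.isclose(gmm_loglik(_toy, [1.0, 4.0], [0.2, 0.2], [0.4, 0.6]), _direct)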
if __name__ == "__main__":
    visualise = 0
    old_faithf_dat = "Old_faithful_data.csv"
    data = pd.read_csv(old_faithf_dat)
    data_scaled, scaler = preprocess_data(data)
    data_scaled_df = pd.DataFrame(data_scaled, columns=data.columns.tolist())
    if visualise:
        plt.scatter(data_scaled_df.eruptions, data_scaled_df.waiting)
        plt.show()

    # Univariate #
    # (the scaled copy is only used for the scatter plot above;
    #  the EM fit below runs on the raw eruption times)
    if visualise:
        plt.hist(data.eruptions)
        plt.show()

    # 1. Do Manual EM
    num_iter = 10
    num_mixtures = 2   # how many gaussians
    N = len(data)      # number of observations

    # parameter arrays: row i holds the parameter estimates after iteration i
    means = np.zeros((num_iter + 1, num_mixtures))
    vars = np.zeros((num_iter + 1, num_mixtures))
    priors = np.zeros((num_iter + 1, num_mixtures))

    # initial values
    means[0] = np.array([1, 6])
    vars[0] = np.array([0.2, 0.2])
    priors[0] = np.array([0.4, 0.6])

    for i in range(num_iter):  # http://statweb.stanford.edu/~tibs/stat315a/LECTURES/em.pdf Slide 13
        # E-step: calculate expectations (responsibilities)
        expectations = gmm_expectation(data.eruptions, means[i], vars[i], priors[i])
        # M-step: maximise the likelihood for all three parameter sets given the expectations
        means[i + 1] = [(expectations[:, j] * data.eruptions).sum() / expectations[:, j].sum()
                        for j in range(num_mixtures)]
        vars[i + 1] = [(expectations[:, j] * (data.eruptions - means[i + 1, j]) ** 2).sum()
                       / expectations[:, j].sum()
                       for j in range(num_mixtures)]
        priors[i + 1] = [expectations[:, j].sum() / N for j in range(num_mixtures)]
        print(str(i) + ' |means: ' + str(means[i + 1].round(2)) +
              ' |vars: ' + str(vars[i + 1].round(2)) +
              ' |priors: ' + str(priors[i + 1].round(2)))
    # 2. Do Scikit-learn GMM
    gmm = GaussianMixture(n_components=num_mixtures)
    gmm.fit(data.eruptions.values.reshape(-1, 1))  # fit expects a 2-D array
    print('Sklearn finds priors: ' + str(np.round(gmm.weights_, 2)))
    print('Sklearn finds means:  ' + str(np.round(gmm.means_, 2)))
    print('Sklearn finds vars:   ' + str(np.round(gmm.covariances_, 2)))
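    # Optional visual check, a sketch using the final manual-EM parameters
    # means[-1], vars[-1], priors[-1] from above: overlay the fitted mixture
    # density on the eruption-time histogram.
    if visualise:
        xs = np.linspace(data.eruptions.min(), data.eruptions.max(), 200)
        dens = sum(priors[-1][j] * norm.pdf(xs, means[-1][j], np.sqrt(vars[-1][j]))
                   for j in range(num_mixtures))
        plt.hist(data.eruptions, bins=30, density=True)
        plt.plot(xs, dens)
        plt.show()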