lda.py
import numpy as np
import re
from scipy.special import gammaln, psi

meanchangethresh = 0.001

def dirichlet_expectation(alpha):
    """
    For a vector or matrix of Dirichlet parameters alpha, compute
    E[log(theta)] where theta ~ Dirichlet(alpha).
    """
    if len(alpha.shape) == 1:
        return psi(alpha) - psi(np.sum(alpha))
    return psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
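# A quick sanity check (hypothetical, not part of the original module):
# for alpha = np.ones(3), every component of E[log theta] equals
# psi(1) - psi(3) = -(1 + 1/2) = -1.5, so
#
#     dirichlet_expectation(np.ones(3))  # -> array([-1.5, -1.5, -1.5])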
def parse_doc_list(docs, vocab):
    """
    Parse a document into a list of word ids and a list of counts,
    or parse a set of documents into two lists of lists of word ids
    and counts.

    Arguments:
    docs:  List of D documents. Each document must be represented as
           a single string. (Word order is unimportant.) Any words
           not in the vocabulary will be ignored.
    vocab: Dictionary mapping from words to integer ids.

    Returns a pair of lists of lists.

    The first, wordids, says which vocabulary tokens are present in
    each document. wordids[i][j] gives the jth unique token present in
    document i. (Don't count on these tokens being in any particular
    order.)

    The second, wordcts, says how many times each vocabulary token is
    present. wordcts[i][j] is the number of times that the token given
    by wordids[i][j] appears in document i.
    """
    if isinstance(docs, str):
        docs = [docs]

    D = len(docs)
    wordids = list()
    wordcts = list()
    for d in range(D):
        # Normalize: lowercase, break hyphenated words apart, strip
        # everything that isn't a letter or a space.
        docs[d] = docs[d].lower()
        docs[d] = re.sub(r'-', ' ', docs[d])
        docs[d] = re.sub(r'[^a-z ]', '', docs[d])
        docs[d] = re.sub(r' +', ' ', docs[d])
        words = docs[d].split()
        ddict = dict()
        for word in words:
            if word in vocab:
                wordtoken = vocab[word]
                if wordtoken not in ddict:
                    ddict[wordtoken] = 0
                ddict[wordtoken] += 1
        wordids.append(list(ddict.keys()))
        wordcts.append(list(ddict.values()))
    return (wordids, wordcts)
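# A minimal usage sketch for parse_doc_list (the vocabulary and document
# below are hypothetical, not part of the original module):
#
#     vocab = {'topic': 0, 'model': 1, 'word': 2}
#     wordids, wordcts = parse_doc_list(['Topic model: word, word!'], vocab)
#     # wordids -> [[0, 1, 2]]  (unique token ids in document 0)
#     # wordcts -> [[1, 1, 2]]  (matching counts: 'word' appears twice)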
class LDA:
    """
    Latent Dirichlet allocation with mean-field variational inference.
    """

    def __init__(self, vocab, K, wordids, wordcts, alpha, eta):
        # Map each vocabulary word to an integer id.
        self._vocab = dict()
        for word in vocab:
            self._vocab[word] = len(self._vocab)
        self._W = len(vocab)
        self._K = K
        self._D = len(wordids)
        self._alpha = alpha
        self._eta = eta
        self._wordids = wordids
        self._wordcts = wordcts
        # Randomly initialize the variational topic parameters lambda.
        self._lambda = np.random.gamma(100., 1./100, (self._K, self._W))
        self._Elogbeta = dirichlet_expectation(self._lambda)
        self._expElogbeta = np.exp(self._Elogbeta)
    def do_e_step(self):
        """
        E step: compute the approximate posterior over each document's
        topic distribution (gamma), along with the sufficient statistics
        needed to update lambda in the M step.
        """
        # Randomly initialize the variational parameters gamma.
        gamma = np.random.gamma(100., 1./100, (self._D, self._K))
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        # Sufficient statistics used to update lambda.
        sstats = np.zeros(self._lambda.shape)
        for d in range(0, self._D):
            ids = self._wordids[d]
            cts = np.array(self._wordcts[d])
            gammad = gamma[d, :]
            Elogthetad = Elogtheta[d, :]
            expElogthetad = expElogtheta[d, :]
            expElogbetad = self._expElogbeta[:, ids]
            # The normalizer for phi_{dw}; add a tiny constant so we
            # never divide by zero.
            phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
            # Alternate between updating gamma and phi until gamma
            # converges.
            for it in range(100):
                lastgamma = gammad
                gammad = self._alpha + expElogthetad * \
                    np.dot(cts / phinorm, expElogbetad.T)
                Elogthetad = dirichlet_expectation(gammad)
                expElogthetad = np.exp(Elogthetad)
                phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
                meanchange = np.mean(abs(gammad - lastgamma))
                if meanchange < meanchangethresh:
                    break
            gamma[d, :] = gammad
            sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)
        # This step finishes computing the sufficient statistics for the
        # M step, so that
        # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
        #             = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
        sstats = sstats * self._expElogbeta
        return (gamma, sstats)
    def do_m_step(self):
        """
        M step: re-estimate the variational topic parameters lambda from
        the sufficient statistics computed in the E step.
        """
        (gamma, sstats) = self.do_e_step()
        self._lambda = self._eta + sstats
        self._Elogbeta = dirichlet_expectation(self._lambda)
        self._expElogbeta = np.exp(self._Elogbeta)
        bound = self.approx_bound(gamma)
        return (gamma, bound)
    def approx_bound(self, gamma):
        """
        Compute the variational lower bound on the log likelihood of
        the corpus under the current variational parameters.
        """
        score = 0
        Elogtheta = dirichlet_expectation(gamma)

        # E[log p(docs | theta, beta)]
        for d in range(0, self._D):
            ids = self._wordids[d]
            cts = np.array(self._wordcts[d])
            phinorm = np.zeros(len(ids))
            for i in range(0, len(ids)):
                temp = Elogtheta[d, :] + self._Elogbeta[:, ids[i]]
                tmax = max(temp)
                # log-sum-exp over topics, computed stably by factoring
                # out the maximum.
                phinorm[i] = np.log(sum(np.exp(temp - tmax))) + tmax
            score += np.sum(cts * phinorm)

        # E[log p(theta | alpha) - log q(theta | gamma)]
        score += np.sum((self._alpha - gamma) * Elogtheta)
        score += np.sum(gammaln(gamma) - gammaln(self._alpha))
        score += sum(gammaln(self._alpha * self._K) - gammaln(np.sum(gamma, 1)))

        # E[log p(beta | eta) - log q(beta | lambda)]
        score += np.sum((self._eta - self._lambda) * self._Elogbeta)
        score += np.sum(gammaln(self._lambda) - gammaln(self._eta))
        score += np.sum(gammaln(self._eta * self._W) -
                        gammaln(np.sum(self._lambda, 1)))
        return score
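
# A minimal end-to-end sketch, assuming a tiny hypothetical corpus (none of
# the data below comes from the original module): parse the documents, fit
# the model with variational EM, and track the lower bound.
if __name__ == '__main__':
    docs = ['apple banana apple fruit',
            'dog cat dog animal',
            'banana fruit cat animal dog']
    vocab = {w: i for i, w in
             enumerate(['apple', 'banana', 'fruit', 'dog', 'cat', 'animal'])}
    wordids, wordcts = parse_doc_list(docs, vocab)
    model = LDA(vocab, 2, wordids, wordcts, alpha=0.1, eta=0.01)
    for i in range(20):
        gamma, bound = model.do_m_step()
        print('iteration %d: bound = %.3f' % (i, bound))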