-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathautoencoder.py
170 lines (133 loc) · 5.73 KB
/
autoencoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import datasets
from sklearn.utils import shuffle
# Implementation of a feedforward neural network(MLP) to use it as an autoencoder for word embeddings
# Requirements: TensorFlow, Pandas, Sklearn, Numpy
# set random seed value
RANDOM_SEED = 50
tf.set_random_seed(RANDOM_SEED)
def read_csv_file(emb_path):
    """Load a word-embedding CSV file into a DataFrame.

    The first row is taken as the header and the file is decoded as UTF-8.

    Args:
        emb_path: path to the CSV file of pre-processed embeddings.

    Returns:
        pandas.DataFrame with one row per word.
    """
    return pd.read_csv(emb_path, header=0, encoding='utf-8')
def init_weights(shape):
    """Create a trainable weight Variable of the given shape.

    Initial values are drawn from a normal distribution with a small
    standard deviation (0.1) so early activations stay small.

    Args:
        shape: tuple/list giving the weight matrix dimensions.

    Returns:
        tf.Variable initialized with the random values.
    """
    initial = tf.random_normal(shape, stddev=0.1)
    return tf.Variable(initial)
def forward(X, w_1, w_2, b_1, b_2):
    """Forward-propagate inputs through the two-layer network.

    The hidden activations double as the "new synthesized word embedding"
    for this autoencoder.

    NOTE(review): there is no nonlinearity, so the model is purely linear.

    Args:
        X: input batch placeholder/tensor.
        w_1, b_1: weights and bias of the input->hidden layer.
        w_2, b_2: weights and bias of the hidden->output layer.

    Returns:
        (h, y_predict): hidden layer h = X*w_1 + b_1, and the
        reconstruction y_predict = h*w_2 + b_2.
    """
    h = tf.matmul(X, w_1) + b_1
    y_predict = tf.matmul(h, w_2) + b_2
    return h, y_predict
def split_data(data):
    """Split a DataFrame into training (90%) and test (10%) subsets.

    Sampling is seeded with the module-level RANDOM_SEED so the split is
    reproducible; the test set is exactly the rows not sampled for training.

    Args:
        data: pandas.DataFrame of embeddings, one row per word.

    Returns:
        (train_set, test_set) as two disjoint DataFrames.
    """
    train_set = data.sample(frac=0.9, random_state=RANDOM_SEED)
    test_set = data.drop(train_set.index)
    return train_set, test_set
def main():
    """Train the linear autoencoder on word embeddings, save the model,
    and export the learned weights as a new synthesized embedding set.

    Reads 'data/input.csv', trains for 100 epochs of mini-batch SGD, logs
    train/test/validation MSE per epoch, checkpoints to 'model/model.ckpt',
    and writes the extracted embedding matrix to 'data/syn_embed.csv'.
    """
    # load pre-processed word embeddings
    input_emb = read_csv_file('data/input.csv')  # (19928, 101)
    # split data into training & testing sets
    train, test = split_data(input_emb)
    train_X = train
    train_y = train  # autoencoder: target equals input
    # split the held-out portion into validation & testing sets
    validation = test.sample(frac=0.5, random_state=RANDOM_SEED)
    test = test.drop(validation.index)

    batch = 1000                     # number of words in each mini-batch
    model_path = "model/model.ckpt"  # checkpoint location

    # network dimensions
    x_size = train_X.shape[1]  # number of input nodes
    h_size = 1000              # number of hidden nodes (embedding dimension)
    y_size = train_y.shape[1]  # number of output nodes (reconstruction)

    # graph inputs
    X = tf.placeholder("float", shape=[None, x_size])
    y = tf.placeholder("float", shape=[None, y_size])

    # model parameters
    w_1 = init_weights((x_size, h_size))
    w_2 = init_weights((h_size, y_size))
    b_1 = tf.Variable(tf.ones([h_size]))
    b_2 = tf.Variable(tf.ones([y_size]))

    # forward propagation; hidden_output is the synthesized embedding tensor
    hidden_output, y_predict = forward(X, w_1, w_2, b_1, b_2)

    # loss and SGD update (step size 0.001)
    error = tf.subtract(y, y_predict)
    mse = tf.reduce_mean(tf.square(error))
    update = tf.train.GradientDescentOptimizer(0.001).minimize(mse)

    # 'saver' op to checkpoint all variables
    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    # BUGFIX: the original created an extra tf.Session() here and then
    # shadowed it with the `with` session below, leaking the first one.
    with tf.Session() as sess:
        sess.run(init)
        tf.train.write_graph(sess.graph, "model/", 'graph.pbtxt')
        for epoch in range(100):
            total_mse = total_mse_test = total_mse_valid = 0
            train = shuffle(train)
            test = shuffle(test)
            train_X = train_y = train
            test_X = test_y = test
            validation_X = validation_y = validation

            # training pass: run SGD and accumulate size-weighted MSE so the
            # epoch average is exact even when the final mini-batch is short
            # (the original weighted every batch by the full `batch` size)
            for i in range(0, len(train_X), batch):
                xb, yb = train_X[i:i + batch], train_y[i:i + batch]
                batch_mse, _ = sess.run([mse, update], feed_dict={X: xb, y: yb})
                total_mse += batch_mse * len(xb)
            train_mse = total_mse / len(train_X)

            # test pass (evaluation only, no update)
            # BUGFIX: the original compared i + batch against len(train_X)
            # here, feeding wrong slices whenever the test set size differed
            for i in range(0, len(test_X), batch):
                xb, yb = test_X[i:i + batch], test_y[i:i + batch]
                batch_mse = sess.run(mse, feed_dict={X: xb, y: yb})
                total_mse_test += batch_mse * len(xb)
            test_mse = total_mse_test / len(test_X)

            # validation pass (same bounds fix as the test pass)
            for i in range(0, len(validation_X), batch):
                xb, yb = validation_X[i:i + batch], validation_y[i:i + batch]
                batch_mse = sess.run(mse, feed_dict={X: xb, y: yb})
                total_mse_valid += batch_mse * len(xb)
            validation_mse = total_mse_valid / len(validation_X)

            # values are raw MSE, not percentages (the original printed '%')
            print("Epoch = %d, train average MSE = %.4f, test average MSE = %.4f, validation average MSE = %.4f"
                  % (epoch + 1, train_mse, test_mse, validation_mse))

        # save model
        save_path = saver.save(sess, model_path)
        print("Model saved in file: %s" % save_path)
        # write graph summary for TensorBoard
        file_writer = tf.summary.FileWriter('model/', sess.graph)
        # NOTE(review): "Variable_1:0" is the second tf.Variable created,
        # i.e. w_2 with shape (h_size, y_size) — it is NOT the hidden-layer
        # activations. The original's shape comment "(h_size, 199)" matches
        # w_2, so this appears intentional, but confirm it is really the
        # intended "synthesized embedding".
        layer1_output = tf.get_default_graph().get_tensor_by_name("Variable_1:0")
        syn_embed = sess.run(layer1_output)
    # session is closed automatically by the `with` block
    # (the original also called sess.close() inside it, redundantly)

    # write CSV file for the new embedding set
    print("New embedding saved with size of %s at %s" % (str(syn_embed.shape), 'data/syn_embed.csv'))
    pd.DataFrame(syn_embed).to_csv('data/syn_embed.csv', encoding='utf-8')
    # print instruction to launch TensorBoard
    print("Run the Tensorflow with the command: %s" % ("tensorboard --logdir data/"))
# Run training only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()