main.py
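"""Trains a text sentiment classifier for Polish with a simple
recurrent neural network (LSTM cell), implemented with the Keras API
and pre-trained word embeddings.
"""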
import argparse
import os
from argparse import RawTextHelpFormatter

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def load_reviews(file_name):
    """Loads data from a tab-separated CSV file into a pandas DataFrame.
    # Arguments:
        file_name: name of, or path to, the CSV file.
    """
    csv_content = pd.read_csv(file_name, sep='\t', index_col=0).sort_index()
    return csv_content

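# Assumed input layout (illustrative; inferred from load_reviews and
# convert_input_to_x_and_y, not taken from the source data):
#   <index>\t<tokens>\t<sentiment>
#   0\t['dobry', 'film']\t1
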
def convert_input_to_x_and_y(input_x, input_y):
    """Converts raw texts from the CSV into the format required
    for further analysis.
    # Arguments:
        input_x: column with words.
        input_y: column with labels.
    """
    # Get rid of apostrophes, commas, double spaces and the
    # surrounding brackets
    # TODO: replace with regex, e.g. [\s]+ -> " "
    x = input_x.apply(lambda x: x.replace("\'", "").replace(",", "").replace("  ", " ").replace("  ", " ")[1:-1])
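    # Regex sketch of the TODO above (illustrative, not a drop-in
    # replacement; would need `import re` and the same [1:-1] strip):
    #   x = input_x.apply(lambda s: re.sub(r'\s+', ' ',
    #       s.replace("'", "").replace(",", ""))[1:-1])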
    x = list(x)
    # Convert sentiments to a numpy array
    y = input_y.values
    # Transform the categorical variable into one-hot vectors.
    # LabelEncoder assigns indices in sorted label order, so:
    #   -1 := [1 0 0]
    #    0 := [0 1 0]
    #    1 := [0 0 1]
    le = LabelEncoder()
    y = le.fit_transform(y)
    y = to_categorical(y)
    return x, y

def split_x_y(padded_x, y):
    """Splits x and y with scikit-learn into:
    - train sets,
    - test sets,
    - validation sets.
    # Arguments:
        padded_x: column with words encoded as integer vectors
            of equal length.
        y: column with output values.
    """
    # Test size 20% as a rule of thumb; the second split leaves
    # roughly 64% train / 16% validation / 20% test overall
    x_train, x_test, y_train, y_test = train_test_split(padded_x, y, test_size=0.2)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)
    return x_train, x_test, x_val, y_train, y_test, y_val

def create_set_with_all_words_from_input(tokenizer):
    """Creates a set with all unique words from the input data.
    The set is needed to assign embeddings to these words
    efficiently.
    # Arguments:
        tokenizer: a tokenizer previously fit on the input.
    """
    # The keys of word_index are exactly the unique input words
    return set(tokenizer.word_index)

def load_embeddings_from_file(file_name, all_input_words):
    """Loads pre-trained embeddings from the specified file.
    # Arguments:
        file_name: name of, or path to, the file with embeddings.
        all_input_words: set with all words from the input data.
    """
    embeddings_dict = dict()
    # Open the file with the specified encoding
    with open(file_name, encoding='utf8') as file:
        # Assumption: the first row is a header, so skip it
        next(file)
        # Iterate over the lines of the file
        for line in file:
            content = line.split()
            word_in_line = content[0]
            # Only keep embeddings for words that occur in the input
            if word_in_line not in all_input_words:
                continue
            # Do not take the word itself, only the coefficients
            coefs = np.array(content[1:])
            # Convert string type to float
            # (np.float was removed from NumPy; use the builtin)
            coefs = coefs.astype(float)
            embeddings_dict[word_in_line] = coefs
    # Get the length of any embedding vector
    any_emb_vector = next(iter(embeddings_dict.values()))
    emb_vec_len = len(any_emb_vector)
    return embeddings_dict, emb_vec_len

def create_matrix_from_embeddings(tokenizer, embeddings,
                                  matrix_size, emb_vec_len):
    """Returns a matrix with embeddings assigned to the words from
    the input data.
    The function first creates a zero matrix with one row per word
    and one column per embedding dimension. It then writes each
    word's embedding into the row whose index the previously fit
    tokenizer assigned to that word.
    # Arguments:
        tokenizer: a tokenizer previously fit on the input.
        embeddings: dictionary with all embeddings.
        matrix_size: number of words from the input data.
        emb_vec_len: length of an embedding vector.
    """
    embedding_matrix = np.zeros((matrix_size, emb_vec_len))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def create_model(model, embedding_matrix, rnn_dim, emb_size, max_emb_input_len, emb_vec_len):
    """Adds layers to the model and compiles it.
    # Arguments:
        model: model to add layers onto.
        embedding_matrix: matrix with pre-trained embedding weights.
        rnn_dim: dimensionality of the LSTM cell.
        emb_size: number of words from the input data.
        max_emb_input_len: maximum length of an input vector.
        emb_vec_len: length of an embedding vector.
    # Layers:
        Embedding: non-trainable layer with embedding vectors as weights.
        LSTM: Long Short-Term Memory layer as the main deep learning layer.
        Dense: regular densely-connected NN layer.
    """
    # Embedding layer
    model.add(Embedding(emb_size, output_dim=emb_vec_len,
                        weights=[embedding_matrix],
                        input_length=max_emb_input_len,
                        trainable=False))
    # LSTM layer with dropout and recurrent dropout defined
    model.add(LSTM(rnn_dim, dropout=0.5,
                   recurrent_dropout=0.5))
    # Dense layer with Softmax activation function
    model.add(Dense(3, activation='softmax'))
    # Use the Adam optimizer and categorical crossentropy for compilation
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    # Print the summary table
    model.summary()

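# For reference, create_model above yields these output shapes
# (the batch dimension is None):
#   Embedding -> (None, max_emb_input_len, emb_vec_len)
#   LSTM      -> (None, rnn_dim)
#   Dense     -> (None, 3)
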
def fit_model(model, X_train, X_val, y_train, y_val,
              epochs, batch_size, verbose_level):
    """Starts training the model with the given parameters.
    # Arguments:
        model: model to be trained.
        X_train, X_val, y_train, y_val: training and validation sets.
        epochs: number of training epochs (default: 10).
        batch_size: size of a single batch (default: 32).
        verbose_level: verbosity mode. 0 = silent,
            1 = progress bar, 2 = one line per epoch.
    """
    # Fit the model with the specified arguments.
    # EarlyStopping ends training once the training accuracy stops
    # improving by at least min_delta per epoch
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              callbacks=[EarlyStopping(monitor='acc',
                                       min_delta=0.0001)],
              shuffle=True,
              validation_data=(X_val, y_val),
              verbose=verbose_level)

def evaluate_model(model, X_val, y_val, verbose_level):
    """Model post-assessment.
    # Arguments:
        model: trained model to be evaluated.
        X_val, y_val: sets for validation.
        verbose_level: verbosity mode. 0 = silent,
            1 = progress bar, 2 = one line per epoch.
    """
    score = model.evaluate(X_val, y_val, verbose=verbose_level)
    # Loss is not a percentage, so print it as a plain number
    print("Test Loss: %.4f" % score[0])
    print("Test Accuracy: %.2f%%" % (score[1] * 100))

def main(data_path, embedding_path, rnn_dim, save_dir,
         epochs, batch_size, verbose_level):
    """Main script function, which can be called directly
    from the command line. It performs all the tasks necessary
    to run the script according to the specified requirements.
    # Arguments:
        data_path: path to the file with data, in the same format
            as the example dataset.
        embedding_path: path to the pre-trained embeddings.
        rnn_dim: number of hidden units in the LSTM layer.
        save_dir: path to the directory for saving the trained model.
        epochs: number of training epochs (default: 10).
        batch_size: size of a single batch (default: 32).
        verbose_level: verbosity mode. 0 = silent,
            1 = progress bar, 2 = one line per epoch.
    """
    print("Parameters:")
    print("\tData path:", data_path)
    print("\tEmbedding path:", embedding_path)
    print("\tRNN dimensions:", rnn_dim)
    print("\tSaving directory:", save_dir)
    print("\tEpochs:", epochs)
    print("\tBatch size:", batch_size)
    print("\tVerbose level:", verbose_level)
    # Load the CSV file with reviews
    print("Loading CSV file...")
    df = load_reviews(data_path)
    print("CSV file loaded.")
    # Process X and y in terms of format and special characters
    X, y = convert_input_to_x_and_y(df['tokens'], df['sentiment'])
    # Define a tokenizer, fit it on the input data
    # and convert this data to sequences
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X)
    encoded_X = tokenizer.texts_to_sequences(X)
    # TODO: Normalize tokenized vectors (to values 0-1).
    print("All words tokenized.")
    # Define the vocabulary size, i.e. the number of all words.
    # '+ 1', because the '0' index is reserved and cannot be used
    vocab_size = len(tokenizer.word_index) + 1
    print('Found %s unique tokens.' % len(tokenizer.word_index))
    # Get the max length of an encoded_X vector to calculate the padding space
    max_length = len(max(encoded_X, key=len))
    padded_x = pad_sequences(encoded_X, maxlen=max_length, padding='post')
    # Create a set with all unique words from the input data
    word_index_all = create_set_with_all_words_from_input(tokenizer)
    # Split the input data into six sets
    x_train, x_test, x_val, y_train, y_test, y_val = split_x_y(padded_x, y)
    print("Words split into sets:")
    print("\tx_train shape: " + str(x_train.shape))
    print("\tx_test shape: " + str(x_test.shape))
    print("\tx_val shape: " + str(x_val.shape))
    print("\ty_train shape: " + str(y_train.shape))
    print("\ty_test shape: " + str(y_test.shape))
    print("\ty_val shape: " + str(y_val.shape))
    print("Loading embeddings file...")
    embeddings, emb_vec_len = load_embeddings_from_file(embedding_path, word_index_all)
    print("Embeddings file loaded and processed.")
    print('Found %s word vectors.' % len(embeddings))
    # Create the embedding matrix
    embedding_matrix = create_matrix_from_embeddings(tokenizer, embeddings,
                                                     matrix_size=vocab_size,
                                                     emb_vec_len=emb_vec_len)
    # Define and build the model
    sequential = Sequential()
    create_model(sequential, embedding_matrix,
                 rnn_dim=rnn_dim, emb_size=vocab_size,
                 max_emb_input_len=max_length,
                 emb_vec_len=emb_vec_len)
    # Model fitting
    print("Model training...")
    fit_model(sequential, x_train, x_val, y_train, y_val,
              epochs=epochs, batch_size=batch_size,
              verbose_level=verbose_level)
    print("Model trained.")
    # Model evaluation
    print("Evaluating the model...")
    evaluate_model(sequential, x_val, y_val,
                   verbose_level=verbose_level)
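    # Note: the evaluation above reuses the validation set; the held-out
    # x_test / y_test split is created but never used by the script.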
    # Model saving (os.path.join keeps the path portable across OSes)
    save_path = os.path.join(save_dir, 'sentiment_model.h5')
    sequential.save(save_path)
    print("Model saved to:", save_path)

def dir_path(path):
    """Validates that a path is an existing directory. Used only for argparse."""
    if os.path.isdir(path):
        return path
    else:
        raise argparse.ArgumentTypeError(f"'{path}' is not a valid directory path.")

def create_parser():
    """Builds the command-line argument parser."""
    parser = argparse.ArgumentParser(description="Script for a Deep Learning task, satisfying the following:\n"
                                                 "Your task is to build a python script for text "
                                                 "classifier training for the Polish language using "
                                                 "a simple recurrent neural network with an LSTM cell.\n\n"
                                                 "Deep learning implemented with the Keras API.\n\n",
                                     epilog="Script written by Rafal Klat.\n"
                                            "\tmail: [email protected]\n"
                                            "\tphone: +48 664 495 049",
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument('-d', '--data_path',
                        help='Path to the file with input data.',
                        required=True, type=argparse.FileType('r'))
    parser.add_argument('-e', '--embedding_path',
                        help='Path to the file with pre-trained embeddings.',
                        required=True, type=argparse.FileType('r'))
    parser.add_argument('-r', '--rnn_dim',
                        help='Number of hidden units in the LSTM layer.',
                        required=True, type=int)
    parser.add_argument('-s', '--save_dir',
                        help='Path to the directory where the trained model will be saved.',
                        required=True, type=dir_path)
    parser.add_argument('-E', '--epochs',
                        help='Number of training epochs (optional).\n(default: 10)',
                        required=False, type=int, default=10)
    parser.add_argument('-b', '--batch_size',
                        help='Size of a single batch (optional).\n(default: 32)',
                        required=False, type=int, default=32)
    parser.add_argument('-v', '--verbose_level',
                        help='Verbosity mode.\n0 = silent, 1 = progress bar, 2 = one line '
                             'per epoch (optional).\n(default: 0)',
                        required=False,
                        type=int, choices=[0, 1, 2], default=0)
    return parser

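# Example invocation (paths are illustrative, not from the source):
#   python main.py -d data/reviews.tsv -e embeddings/wiki.pl.vec \
#                  -r 128 -s models/ -E 10 -b 32 -v 1
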
if __name__ == '__main__':
    parser = create_parser()
    args = vars(parser.parse_args())
    data_path = args["data_path"].name
    embedding_path = args["embedding_path"].name
    rnn_dim = args["rnn_dim"]
    save_dir = args["save_dir"]
    epochs = args['epochs']
    batch_size = args['batch_size']
    verbose_level = args['verbose_level']
    print("-" * 65)
    print("Starting SENTIMENT ANALYSIS SCRIPT with Keras LSTM by Rafal Klat.\n")
    print("-" * 65)
    main(data_path, embedding_path, rnn_dim, save_dir, epochs, batch_size, verbose_level)