from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from vocabulary import PAD_token, SOS_token, EOS_token, Voc
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pickle
import numpy as np
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
MAX_LENGTH = 15 # Maximum sentence length to consider
#
# # load voc and pairs
def loadDataset():
with open(os.path.join('data', 'voc.pkl'), 'rb') as handle_voc:
voc = pickle.load(handle_voc)
with open(os.path.join('data', 'pairs.pkl'), 'rb') as handle_pairs:
pairs = pickle.load(handle_pairs)
return voc, pairs
voc, pairs = loadDataset()
#
# # add EOS_token
def indexesFromSentence(voc, sentence):
return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]
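# # example (hypothetical words, assuming both are present in voc.word2index):
# indexesFromSentence(voc, 'hello .')
# # -> [voc.word2index['hello'], voc.word2index['.'], EOS_token]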
#
# zeroPadding pads with zip_longest(), which also transposes the matrix to (max_length, batch_size)
def zeroPadding(l, fillvalue=PAD_token):
return list(itertools.zip_longest(*l, fillvalue=fillvalue))
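# # worked example (hypothetical index lists; zip_longest fills with PAD_token):
# zeroPadding([[5, 6, 2], [7, 2]])
# # -> [(5, 7), (6, 2), (2, PAD_token)]  # max_length rows x batch_size columns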
#
# Binary mask matrix: 0 where the token is PAD_token, 1 otherwise
def binaryMatrix(l, value=PAD_token):
m = []
for i, seq in enumerate(l):
m.append([])
for token in seq:
if token == PAD_token:
m[i].append(0)
else:
m[i].append(1)
return m
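# # continuing the sketch above (PAD_token assumed distinct from the real IDs):
# binaryMatrix([(5, 7), (6, 2), (2, PAD_token)])
# # -> [[1, 1], [1, 1], [1, 0]]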
#
# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l] # word2index batch
lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)  # list to LongTensor, shape (max_length, batch_size)
# padVar = torch.tensor(padList, dtype=torch.long)
return padVar, lengths
#
# # Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
max_target_len = max([len(indexes) for indexes in indexes_batch])
padList = zeroPadding(indexes_batch)
mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)  # note: newer PyTorch versions prefer torch.BoolTensor for masks
    padVar = torch.LongTensor(padList)  # note the capitalization: LongTensor, not longTensor
# padVar = torch.tensor(padList, dtype=torch.long)
return padVar, mask, max_target_len
#
# # Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(' ')), reverse=True)  # sort by input length (descending), as pack_padded_sequence expects
input_batch, output_batch = [], []
for pair in pair_batch:
input_batch.append(pair[0])
output_batch.append(pair[1])
inp, lengths = inputVar(input_batch, voc) # inp: padVar(shape(max_length, batch_size))
output, mask, max_target_len = outputVar(output_batch, voc)
return inp, lengths, output, mask, max_target_len
# # testing
# small_batch_size = 5
# batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
# input_variable, lengths, target_variable, mask, max_target_len = batches
#
# print('input_variable:', input_variable)
# print('length:', lengths)
# print('target_variable:', target_variable)
# print('mask:', mask)
# print('max_target_len:', max_target_len)
#
# # EncoderRNN
class EncoderRNN(nn.Module):
def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
super(EncoderRNN, self).__init__()
self.n_layers = n_layers
self.hidden_size = hidden_size
self.embedding = embedding
        # both the input feature size and the hidden size equal hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)
def forward(self, input_seq, input_lengths, hidden=None):
# input_seq shape(max_length, batch) -> embedding shape(max_length, batch, hidden_size)
embedded = self.embedding(input_seq)
# Pack padded batch of sequences for RNN module
        # The RNN (GRU) needs the actual sequence lengths, so PyTorch provides
        # pack_padded_sequence to pack the padded inputs and their lengths into
        # a PackedSequence object that the RNN can consume efficiently.
packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward through the GRU, passing the input and the hidden state.
        # If the input were a plain Tensor of shape (max_length, batch, hidden_size),
        # outputs would have shape (max_length, batch, hidden_size*num_directions).
        # The third dimension mixes hidden_size and num_directions, with the
        # directions ordered first, so outputs.view(seq_len, batch, num_directions, hidden_size)
        # yields a 4-D tensor whose third dimension is the direction and whose
        # fourth is the hidden state.
        # Since the input here is a PackedSequence, outputs is also a PackedSequence;
        # pad_packed_sequence unpacks it into a tensor of shape
        # (max_length, batch, hidden_size*num_directions) plus a list of output
        # lengths. That list is identical to input_lengths, so it is usually not needed.
        # hidden: updated hidden state from GRU; shape=(n_layers x num_directions, batch_size, hidden_size)
        outputs, hidden = self.gru(packed, hidden)  # passing hidden=None initializes the hidden state to zeros
        # As noted above, outputs has shape (max_length, batch, hidden_size*num_directions)
outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum the two directional outputs. The third dimension of outputs stores
        # the forward results in its first hidden_size entries and the backward
        # results in the remaining hidden_size entries, so
        # outputs[:, :, :self.hidden_size] is the forward output and
        # outputs[:, :, self.hidden_size:] the backward output.
        # Note: if bidirectional were False, the third dimension would be just
        # hidden_size and outputs[:, :, self.hidden_size:] would be empty,
        # so nothing would be added.
outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return the final outputs and the last hidden state.
return outputs, hidden
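# # minimal usage sketch (hypothetical sizes; the embedding is shared with the decoder):
# hidden_size = 500
# embedding = nn.Embedding(voc.num_words, hidden_size)
# encoder = EncoderRNN(hidden_size, embedding, n_layers=2, dropout=0.1).to(device)
# encoder_outputs, encoder_hidden = encoder(input_variable.to(device), lengths)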
#
# # Luong attention layer
class Attn(torch.nn.Module):
def __init__(self, method, hidden_size):
super(Attn, self).__init__()
self.method = method
if self.method not in ['dot', 'general', 'concat']:
raise ValueError(self.method, 'is not an appropriate attention method.')
self.hidden_size = hidden_size
if self.method == 'general':
self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
elif self.method == 'concat':
self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))
def dot_score(self, hidden, encoder_output):
        # hidden has shape (1, batch=64, hidden_size=500);
        # encoder_output has shape (input_lengths=10, batch=64, hidden_size=500).
        # hidden * encoder_output broadcasts to shape (10, 64, 500); summing over
        # the third dimension (dim=2) gives the score.
return torch.sum(hidden * encoder_output, dim=2)
def general_score(self, hidden, encoder_output):
energy = self.attn(encoder_output)
return torch.sum(hidden * energy, dim=2)
def concat_score(self, hidden, encoder_output):
energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
return torch.sum(self.v * energy, dim=2)
    # Input: the hidden state from the previous step and all encoder outputs.
    # Output: the attention weights, a vector of length input_lengths that sums to 1.
def forward(self, hidden, encoder_outputs):
        # Compute the attention scores. Assume hidden has shape
        # (1, batch=64, hidden_size=500), the batch's hidden state at step t,
        # and encoder_outputs has shape (input_lengths=10, batch=64, hidden_size=500).
if self.method == 'general':
attn_energies = self.general_score(hidden, encoder_outputs)
elif self.method == 'concat':
attn_energies = self.concat_score(hidden, encoder_outputs)
elif self.method == 'dot':
attn_energies = self.dot_score(hidden, encoder_outputs)
        # attn_energies shape: sum((1, 64, 500) * (10, 64, 500), dim=2) -> (10, 64)
        # Transpose attn_energies from (max_length=10, batch=64) to (64, 10)
        attn_energies = attn_energies.t()
        # Softmax turns the scores into probabilities, shape still (64, 10);
        # unsqueeze(1) then makes it (64, 1, 10)
return F.softmax(attn_energies, dim=1).unsqueeze(1)
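# # shape-check sketch (hypothetical sizes, random tensors):
# attn = Attn('dot', 500)
# weights = attn(torch.randn(1, 64, 500), torch.randn(10, 64, 500))
# # weights.shape -> torch.Size([64, 1, 10]); each weights[i, 0, :] sums to 1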
#
# # Luong attention decoder
class LuongAttnDecoderRNN(nn.Module):
def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
super(LuongAttnDecoderRNN, self).__init__()
self.attn_model = attn_model
self.hidden_size = hidden_size
self.output_size = output_size # output_size: voc.num_words
self.n_layers = n_layers
self.dropout = dropout
        # Define decoder layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
self.concat = nn.Linear(hidden_size * 2, hidden_size)
self.out = nn.Linear(hidden_size, output_size)
self.attn = Attn(attn_model, hidden_size)
def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: the decoder processes one time step at a time, since step t must
        # finish before step t+1 can begin.
        # input_step has shape (1, 64): 64 is the batch, and dim 0 holds the
        # current input word IDs (taken from the previous step's output).
        # The embedding layer maps it to (1, 64, 500); dropout keeps the shape.
embedded = self.embedding(input_step)
embedded = self.embedding_dropout(embedded)
        # Forward the embedded input through the GRU.
        # rnn_output has shape (seq_len, batch_size, hidden_size) = (1, 64, 500):
        # the decoder output for this batch at the current step.
        # hidden has shape (n_layers x num_directions, batch_size, hidden_size)
        # = (2, 64, 500): the GRU is unidirectional with two layers, hence 2.
rnn_output, hidden = self.gru(embedded, last_hidden)
        # Compute attention weights; per the analysis above, attn_weights has shape (64, 1, 10)
attn_weights = self.attn(rnn_output, encoder_outputs)
        # encoder_outputs is (10, 64, 500);
        # encoder_outputs.transpose(0, 1) is (64, 10, 500);
        # attn_weights.bmm(...) then gives (64, 1, 500).
        # bmm (batch matrix multiplication) multiplies matrices batch-wise: treat
        # attn_weights as 64 matrices of shape (1, 10) and the transposed
        # encoder_outputs as 64 matrices of shape (10, 500); each
        # (1, 10) x (10, 500) product is (1, 500), giving (64, 1, 500) overall.
context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate the context vector with the GRU output.
        # rnn_output goes from (1, 64, 500) to (64, 500)
rnn_output = rnn_output.squeeze(0)
        # context goes from (64, 1, 500) to (64, 500)
context = context.squeeze(1)
        # Concatenating gives (64, 1000)
concat_input = torch.cat((rnn_output, context), 1)
        # self.concat is a (1000, 500) linear layer,
        # so self.concat(concat_input) has shape (64, 500).
        # tanh squashes the result into (-1, 1); concat_output has shape (64, 500)
        concat_output = torch.tanh(self.concat(concat_input))
        # self.out is a (500, vocab size=7826) linear layer;
        # output has shape (batch=64, output_size=voc.num_words), each column indexing a word ID
output = self.out(concat_output)
        # Softmax turns the scores into the probability of each word at this step.
output = F.softmax(output, dim=1)
        # Return the output distribution and the new hidden state
return output, hidden
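# # one-step usage sketch (hypothetical sizes; embedding shared with the encoder):
# decoder = LuongAttnDecoderRNN('dot', embedding, 500, voc.num_words, n_layers=2).to(device)
# step_input = torch.LongTensor([[SOS_token] * 64]).to(device)  # shape (1, 64)
# out, hid = decoder(step_input, encoder_hidden[:decoder.n_layers], encoder_outputs)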
#
# # mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
# # decoder_output shape (batch=64, output_size=voc.num_words); each column indexes a word ID
# # averages the loss of a single time step over the non-padded entries of the batch
def maskNLLLoss(inp, target, mask):
# target torch.size([64])
# mask torch.size([64])
    # Count the real words: padding entries are 0 and real tokens are 1 in the mask, so sum() gives the count
nTotal = mask.sum()
    # torch.gather: https://blog.csdn.net/edogawachia/article/details/80515038
    # torch.gather(inp, 1, target.view(-1, 1)) has shape (64, 1);
    # .squeeze(1) reduces it to torch.Size([64]).
    # For inp = decoder_output (batch_size, voc.num_words), pick the probability
    # assigned to each target[t] word ID and take its negative log (cross-entropy).
crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    # Keep only the cross-entropy terms where mask[t] is set, then take the mean
loss = crossEntropy.masked_select(mask).mean()
loss = loss.to(device)
return loss, nTotal.item()
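# # toy example (hypothetical 2-word vocabulary, batch of 2):
# inp = torch.tensor([[0.9, 0.1], [0.4, 0.6]])  # per-word output probabilities
# target = torch.tensor([0, 1])
# mask = torch.ByteTensor([1, 0])  # the second position is padding
# maskNLLLoss(inp, target, mask)  # -> (tensor(0.1054), 1), i.e. -log(0.9) over 1 real token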
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, teacher_forcing_ratio, max_length=MAX_LENGTH):
    # Zero the gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
# set device
input_variable = input_variable.to(device)
lengths = lengths.to(device)
target_variable = target_variable.to(device)
mask = mask.to(device)
# initialization
loss = 0
print_losses = []
n_totals = 0
# encoder Forwarding
# encoder_outputs: shape(max_length, batch_size, hidden_size)
# encoder_hidden: shape=(n_layers x num_directions, batch_size, hidden_size)
encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
    # The decoder's first input is SOS; build a (1, batch) tensor holding the batch's inputs for the first time step.
decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
decoder_input = decoder_input.to(device)
# Set initial decoder hidden state to the encoder's final hidden state
decoder_hidden = encoder_hidden[:decoder.n_layers]
    # Decide whether to use teacher forcing for this iteration
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    # Process one time step at a time
if use_teacher_forcing:
for t in range(max_target_len):
decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Teacher forcing: the next input is the current ground-truth target
            decoder_input = target_variable[t].view(1, -1)  # view() reshapes the 1-D target_variable[t] into a (1, batch_size) matrix
            # Accumulate loss; mask_loss is the average loss over the nTotal real tokens
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
else:
for t in range(max_target_len):
decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            # No teacher forcing: the next input is the word the model itself
            # predicts with the highest probability; topi holds those word IDs.
            # Note: Tensor has topk() (lowercase), not topK(); max(1) works as well
            # and avoids the "'Tensor' object has no attribute 'topK'" error.
            _, topi = decoder_output.max(1)
            decoder_input = torch.LongTensor([[topi[i] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Accumulate loss; mask_loss is the average loss over the nTotal real tokens
mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
loss += mask_loss
print_losses.append(mask_loss.item() * nTotal)
n_totals += nTotal
    # Backpropagate
    loss.backward()
    # Clip gradients for both encoder and decoder
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
    # Update parameters
encoder_optimizer.step()
decoder_optimizer.step()
return sum(print_losses) / n_totals
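# # single-batch sketch (hypothetical optimizers; batch unpacked from batch2TrainData):
# encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
# decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0005)
# loss = train(input_variable, lengths, target_variable, mask, max_target_len,
#              encoder, decoder, embedding, encoder_optimizer, decoder_optimizer,
#              batch_size=64, clip=50.0, teacher_forcing_ratio=1.0)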
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every,
save_every, clip, corpus_name, loadFilename, hidden_size, teacher_forcing_ratio):
    # Randomly sample n_iteration batches of (input, target) pairs
training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
for _ in range(n_iteration)]
    # Initialization
    print('Initializing...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        # assumes `checkpoint` was loaded from loadFilename by the calling script
        start_iteration = checkpoint['iteration'] + 1
    # Training loop
print('Training...')
for iteration in range(start_iteration, n_iteration + 1):
training_batch = training_batches[iteration - 1]
input_variable, lengths, target_variable, mask, max_target_len = training_batch
        # Train on one batch of data
loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip, teacher_forcing_ratio)
print_loss += loss
        # Report progress
if iteration % print_every == 0:
print_loss_avg = print_loss / print_every
print('Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}'.format(
iteration, iteration / n_iteration * 100, print_loss_avg))
print_loss = 0
        # Save a checkpoint
        if iteration % save_every == 0:
directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(
encoder_n_layers, decoder_n_layers, hidden_size))
if not os.path.exists(directory):
os.makedirs(directory)
torch.save({
'iteration': iteration,
'en': encoder.state_dict(),
'de': decoder.state_dict(),
'en_opt': encoder_optimizer.state_dict(),
'de_opt': decoder_optimizer.state_dict(),
'loss': loss,
'voc_dict': voc.__dict__,
'embedding': embedding.state_dict()
}, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
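# # typical invocation sketch (hypothetical hyperparameters and corpus name):
# trainIters('cb_model', voc, pairs, encoder, decoder, encoder_optimizer,
#            decoder_optimizer, embedding, 2, 2, 'save', 4000, 64, 100, 500,
#            50.0, 'movie_corpus', None, 500, 1.0)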
#
# # Greedy decoding
class GreedySearchDecoder(nn.Module):
def __init__(self, encoder, decoder):
super(GreedySearchDecoder, self).__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, input_seq, input_length, max_length):
        # Encoder forward pass
encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Use the encoder's final hidden state, encoder_hidden (n_layers*num_directions, batch, hidden_size), as the decoder's initial hidden state
decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # All of our functions expect (time, batch) inputs, so even a single
        # example must be two-dimensional.
        # The decoder's initial input is SOS
decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Tensors that will collect the decoded tokens and their scores
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Loop bounded only by max_length; EOS tokens are stripped out later.
for _ in range(max_length):
            # One decoder forward step
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # decoder_output is (batch=1, vocab_size)
            # max returns the highest probability and the corresponding word ID
decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Append the result to all_tokens and all_scores
all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # decoder_input is the word ID emitted at this step; it is 1-D because
            # max drops a dimension. The decoder expects a batch dimension, so
            # unsqueeze restores it: decoder_input shape (input_step=1, batch_size=1)
decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return all decoded tokens and scores
return all_tokens, all_scores
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    # Convert the input sentence (a batch of one) into word IDs
    indexes_batch = [indexesFromSentence(voc, sentence)]  # shape (batch, lengths)
    # Create the lengths tensor
lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose to (time, batch)
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)  # shape (len(sentence), batch=1)
    # Move to the configured device
input_batch = input_batch.to(device)
lengths = lengths.to(device)
# use GreedySearchDecoder
tokens, scores = searcher(input_batch, lengths, max_length)
    # Convert the IDs back into words
decoded_words = [voc.index2word[token.item()] for token in tokens]
return decoded_words
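# # wiring sketch (assumes encoder and decoder are built and in eval() mode):
# encoder.eval(); decoder.eval()
# searcher = GreedySearchDecoder(encoder, decoder)
# print(evaluate(encoder, decoder, searcher, voc, normalizeString('hello .')))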
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
)
def normalizeString(s):
s = unicodeToAscii(s.lower().strip())
    s = re.sub(r'([.!?])', r' \1', s)  # insert a space before punctuation
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)  # replace anything but letters and .!? with a space
    s = re.sub(r'\s+', r' ', s).strip()  # collapse the resulting runs of spaces and strip the ends
return s
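# # example: normalizeString("Aren't   you coming?!") -> 'aren t you coming ? !'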
def evaluateInput(encoder, decoder, searcher, voc):
input_sentence = ''
    while True:
try:
input_sentence = input('> ')
if input_sentence == 'q' or input_sentence == 'quit': break
# input_sentence normalization
input_sentence = normalizeString(input_sentence)
# print(input_sentence)
# gen evaluate sentence
output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
# remove 'EOS' and 'PAD'
words = []
for word in output_words:
if word == 'EOS':
break
elif word != 'PAD':
words.append(word)
print('Bot:', ' '.join(words))
except KeyError:
print('Error: Encountered unknown word.')
'''
Overall shape transformations through the model:
1. (batch_size, max_length)                              raw batch of sentences
2. (max_length, batch_size)                              after zeroPadding (transposed)
3. (max_length, batch_size, hidden_size)                 after embedding(voc.num_words, hidden_size)
4. (max_length, batch_size, hidden_size*num_directions)  encoder_outputs (bidirectional)
5. (max_length, batch_size, hidden_size)                 sum(forward-direction, backward-direction)
   (n_layers*num_directions, batch_size, hidden_size)    last_hidden
'''