Commit
* #55 Merge outputs as feature
* Update README.md
Showing 6 changed files with 167 additions and 23 deletions.
# Keras BERT

[Travis](https://travis-ci.org/CyberZHG/keras-bert)
[Coverage](https://coveralls.io/github/CyberZHG/keras-bert)
[License Scan](https://app.fossa.com/projects/git%2Bgithub.com%2FCyberZHG%2Fkeras-bert?ref=badge_shield)

An unofficial implementation of [BERT](https://arxiv.org/pdf/1810.04805.pdf) that can load the official pre-trained models for feature extraction and prediction.

## Install

```bash
pip install keras-bert
```

## Usage

### Using the official models

The [feature extraction demo](./demo/load_model/load_and_extract.py) uses the official pre-trained `chinese_L-12_H-768_A-12` model and produces the same results as the official tool.

The [prediction demo](./demo/load_model/load_and_predict.py) shows how to fill in masked words and predict whether two sentences are consecutive.
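As a rough orientation, here is a minimal sketch of what the feature-extraction demo does: load the official checkpoint and run a sentence through it. The paths are placeholders and assume a locally downloaded, unpacked copy of `chinese_L-12_H-768_A-12`:

```python
import os

import numpy as np
from keras_bert import load_trained_model_from_checkpoint, Tokenizer

# Placeholder paths: point these at your unpacked checkpoint directory.
pretrained_path = 'chinese_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

# Build the token dictionary from the official vocabulary file.
token_dict = {}
with open(vocab_path, 'r', encoding='utf-8') as reader:
    for line in reader:
        token_dict[line.strip()] = len(token_dict)

model = load_trained_model_from_checkpoint(config_path, checkpoint_path)

tokenizer = Tokenizer(token_dict)
indices, segments = tokenizer.encode(first='语言模型', max_len=512)

# One feature vector per token position.
features = model.predict([np.array([indices]), np.array([segments])])[0]
print(features.shape)
```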
### Tokenization

The `Tokenizer` class handles tokenization, including normalization and greedy longest-match-first splitting of English tokens; Chinese characters within the CJK range are split into single characters.

```python
from keras_bert import Tokenizer

token_dict = {
    '[CLS]': 0,
    '[SEP]': 1,
    'un': 2,
    '##aff': 3,
    '##able': 4,
    '[UNK]': 5,
}
tokenizer = Tokenizer(token_dict)
print(tokenizer.tokenize('unaffable'))  # The result is: `['[CLS]', 'un', '##aff', '##able', '[SEP]']`

indices, segments = tokenizer.encode('unaffable')
print(indices)   # Token indices: `[0, 2, 3, 4, 1]`
print(segments)  # Segment indices: `[0, 0, 0, 0, 0]`

print(tokenizer.tokenize(first='unaffable', second='钢'))
# The result is: `['[CLS]', 'un', '##aff', '##able', '[SEP]', '钢', '[SEP]']`
indices, segments = tokenizer.encode(first='unaffable', second='钢', max_len=10)
print(indices)   # Token indices: `[0, 2, 3, 4, 1, 5, 1, 0, 0, 0]`
print(segments)  # Segment indices: `[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]`
```
|
||
### 训练和使用 | ||
|
||
训练过程推荐使用官方的代码。这个代码库内包含一个的训练过程,`training`为`True`的情况下使用的是带warmup的Adam优化器: | ||
|
||
```python | ||
from keras_bert import get_base_dict, get_model, gen_batch_inputs | ||
|
||
|
||
# 随便的输入样例: | ||
sentence_pairs = [ | ||
[['all', 'work', 'and', 'no', 'play'], ['makes', 'jack', 'a', 'dull', 'boy']], | ||
[['from', 'the', 'day', 'forth'], ['my', 'arm', 'changed']], | ||
[['and', 'a', 'voice', 'echoed'], ['power', 'give', 'me', 'more', 'power']], | ||
] | ||
|
||
|
||
# 构建自定义词典 | ||
token_dict = get_base_dict() # 初始化特殊符号,如`[CLS]` | ||
for pairs in sentence_pairs: | ||
for token in pairs[0] + pairs[1]: | ||
if token not in token_dict: | ||
token_dict[token] = len(token_dict) | ||
token_list = list(token_dict.keys()) # Used for selecting a random word | ||
|
||
|
||
# 构建和训练模型 | ||
model = get_model( | ||
token_num=len(token_dict), | ||
head_num=5, | ||
transformer_num=12, | ||
embed_dim=25, | ||
feed_forward_dim=100, | ||
seq_len=20, | ||
pos_num=20, | ||
dropout_rate=0.05, | ||
) | ||
model.summary() | ||
|
||
def _generator(): | ||
while True: | ||
yield gen_batch_inputs( | ||
sentence_pairs, | ||
token_dict, | ||
token_list, | ||
seq_len=20, | ||
mask_rate=0.3, | ||
swap_sentence_rate=1.0, | ||
) | ||
|
||
model.fit_generator( | ||
generator=_generator(), | ||
steps_per_epoch=1000, | ||
epochs=100, | ||
validation_data=_generator(), | ||
validation_steps=100, | ||
callbacks=[ | ||
keras.callbacks.EarlyStopping(monitor='val_loss', patience=5) | ||
], | ||
) | ||
|
||
|
||
# 使用训练好的模型 | ||
inputs, output_layer = get_model( | ||
token_num=len(token_dict), | ||
head_num=5, | ||
transformer_num=12, | ||
embed_dim=25, | ||
feed_forward_dim=100, | ||
seq_len=20, | ||
pos_num=20, | ||
dropout_rate=0.05, | ||
training=False, # 当`training`是`False`,返回值是输入和输出 | ||
trainable=False, # 模型是否可训练,默认值和`training`相同 | ||
output_layer_num=4, # 最后几层的输出将合并在一起作为最终的输出,只有当`training`是`False`有效 | ||
) | ||
``` |
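For reference, one way to use the returned `inputs` and `output_layer` is to attach a downstream head to them. The sketch below is illustrative only: the two-class softmax head and the pooling of the `[CLS]` position are assumptions, not part of this library.

```python
import keras

# Hypothetical classification head: take the features at the `[CLS]`
# position and attach a two-class softmax layer.
cls_embedding = keras.layers.Lambda(lambda x: x[:, 0])(output_layer)
predictions = keras.layers.Dense(units=2, activation='softmax')(cls_embedding)

classifier = keras.models.Model(inputs=inputs, outputs=predictions)
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
classifier.summary()
```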